ExtensionCrawler/ExtensionCrawler/config.py

151 lines
5.0 KiB
Python

#!/usr/bin/env python3
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Global configuration of the Extension Crawler and related tools."""
import os
import json
def const_sitemap_url():
"""Sitemap URL."""
return "https://chrome.google.com/webstore/sitemap"
def const_sitemap_scheme():
"""URL of Sitemap schema."""
return "http://www.sitemaps.org/schemas/sitemap/0.9"
def const_overview_url(ext_id):
"""URL template for the overview page of an extension."""
return 'https://chrome.google.com/webstore/detail/{}'.format(ext_id)
def const_store_url():
"""Main URL of the Chrome store."""
return 'https://chrome.google.com/webstore'
def const_review_url():
"""Base URL of the review page of an extension."""
return 'https://chrome.google.com/reviews/components'
def const_review_search_url():
"""Base URL for review search."""
return 'https://chrome.google.com/reviews/json/search'
def const_support_url():
"""Base URL for support pages."""
return 'https://chrome.google.com/reviews/components'
def const_download_url():
"""Base download URL."""
return ('https://clients2.google.com/service/update2/' +
'crx?response=redirect&nacl_arch=x86-64&' +
'prodversion=9999.0.9999.0&x=id%3D{}%26uc')
def const_categories():
"""List of known categories."""
return [
'extensions', 'ext/22-accessibility', 'ext/10-blogging',
'ext/15-by-google', 'ext/11-web-development', 'ext/14-fun',
'ext/6-news', 'ext/28-photos', 'ext/7-productivity',
'ext/38-search-tools', 'ext/12-shopping', 'ext/1-communication',
'ext/13-sports'
]
def const_support_payload(ext_id, start, end):
"""Payload for requesting support pages."""
return (
'req={{ "appId":94,' + '"version":"150922",' + '"hl":"en",' +
'"specs":[{{"type":"CommentThread",' +
'"url":"http%3A%2F%2Fchrome.google.com%2Fextensions%2Fpermalink%3Fid%3D{}",'
+ '"groups":"chrome_webstore_support",' + '"startindex":"{}",' +
'"numresults":"{}",' + '"id":"379"}}],' + '"internedKeys":[],' +
'"internedValues":[]}}').format(ext_id, start, end)
def const_review_payload(ext_id, start, end):
"""Payload for requesting review pages."""
return (
'req={{ "appId":94,' + '"version":"150922",' + '"hl":"en",' +
'"specs":[{{"type":"CommentThread",' +
'"url":"http%3A%2F%2Fchrome.google.com%2Fextensions%2Fpermalink%3Fid%3D{}",'
+ '"groups":"chrome_webstore",' + '"sortby":"cws_qscore",' +
'"startindex":"{}",' + '"numresults":"{}",' + '"id":"428"}}],' +
'"internedKeys":[],' + '"internedValues":[]}}').format(ext_id, start,
end)
def const_review_search_payload(params):
"""Payload for searches."""
pre = """req={"applicationId":94,"searchSpecs":["""
post = """]}&requestSource=widget"""
args = []
for extid, author, start, numresults, groups in params:
args += [
"""{{"requireComment":true,"entities":[{{"annotation":"""
"""{{"groups":{},"author":"{}","""
""""url":"http://chrome.google.com/extensions/permalink?id={}"}}}}],"""
""""matchExtraGroups":true,"startIndex":{},"numResults":{},"""
""""includeNicknames":true,"locale": {{"language": "en","country": "us"}}}}"""
.format(json.dumps(groups), author, extid, start, numresults)
]
return pre + ",".join(args) + post
def get_local_archive_dir(ext_id):
"""Local archive dir of extension."""
return "{}".format(ext_id[:3])
def archive_file(archivedir, ext_id):
"""Archive tar of an extension."""
return os.path.join(
str(archivedir), get_local_archive_dir(ext_id), ext_id + ".tar")
def db_file(archivedir, ext_id):
"""DB (sqlite) file of an extension."""
return os.path.join(archivedir,
get_local_archive_dir(ext_id), ext_id + ".sqlite")
def jsloc_timeout():
"""Maximum number of seconds for counting jsloc per extension."""
return 600
def const_basedir():
"""Top-level directory for the extension crawler archive."""
return "archive"
def const_parallel_downloads():
"""Number of parallel downloads."""
return 36
def const_verbose():
"""Default verbosity."""
return True
def const_discover():
"""Default configuration of discovery mode"""
return False