ExtensionCrawler/ExtensionCrawler/config.py

175 lines
5.3 KiB
Python
Raw Permalink Normal View History

2019-01-16 07:23:18 +00:00
#!/usr/bin/env python3.7
2017-01-20 23:02:56 +00:00
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
2017-06-20 07:10:28 +00:00
#
2017-01-20 23:02:56 +00:00
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
2017-07-29 09:57:35 +00:00
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Global configuration of the Extension Crawler and related tools."""
2017-01-20 23:02:56 +00:00
import os
import json
2017-01-27 22:40:07 +00:00
2017-06-16 22:19:13 +00:00
2017-01-20 23:02:56 +00:00
def const_sitemap_url():
    """Return the URL of the Chrome Web Store sitemap."""
    sitemap_url = "https://chrome.google.com/webstore/sitemap"
    return sitemap_url
2017-01-27 22:40:07 +00:00
2017-01-20 23:02:56 +00:00
def const_sitemap_scheme():
    """Return the URL identifying the sitemap schema (version 0.9)."""
    schema_url = "http://www.sitemaps.org/schemas/sitemap/0.9"
    return schema_url
2017-01-27 22:40:07 +00:00
2017-07-29 09:57:35 +00:00
def const_overview_url(ext_id):
    """Return the web store detail (overview) page URL for ``ext_id``.

    ``ext_id`` is appended verbatim (using its default string formatting)
    to the ``/webstore/detail/`` base URL.
    """
    return f"https://chrome.google.com/webstore/detail/{ext_id}"
2017-01-27 22:40:07 +00:00
def const_store_url():
    """Return the main URL of the Chrome Web Store."""
    store_url = 'https://chrome.google.com/webstore'
    return store_url
def const_review_url():
    """Return the base URL used for requesting an extension's reviews."""
    review_endpoint = 'https://chrome.google.com/reviews/components'
    return review_endpoint
2017-07-12 15:10:47 +00:00
def const_review_search_url():
    """Return the base URL of the review search endpoint."""
    search_endpoint = 'https://chrome.google.com/reviews/json/search'
    return search_endpoint
2017-01-27 22:40:07 +00:00
def const_support_url():
    """Return the base URL used for requesting an extension's support pages.

    Note that this is the same ``reviews/components`` endpoint that
    ``const_review_url`` returns.
    """
    support_endpoint = 'https://chrome.google.com/reviews/components'
    return support_endpoint
def const_download_url():
    """Return the download URL template.

    The returned string contains one ``{}`` placeholder (right after
    ``id%3D``) that the caller is expected to fill in, e.g. with
    ``str.format`` -- presumably with the extension id.
    """
    endpoint = 'https://clients2.google.com/service/update2/crx'
    query = '&'.join([
        'response=redirect',
        'nacl_arch=x86-64',
        'prodversion=9999.0.9999.0',
        'x=id%3D{}%26uc',
    ])
    return endpoint + '?' + query
2017-01-27 22:40:07 +00:00
def const_categories():
    """Return a fresh list of the known web store category paths.

    The first entry is the top-level ``extensions`` category; all other
    entries are sub-categories prefixed with ``ext/``.
    """
    sub_categories = (
        '22-accessibility', '10-blogging', '15-by-google',
        '11-web-development', '14-fun', '6-news', '28-photos',
        '7-productivity', '38-search-tools', '12-shopping',
        '1-communication', '13-sports',
    )
    return ['extensions'] + ['ext/' + slug for slug in sub_categories]
def const_support_payload(ext_id, start, end):
    """Build the ``req=...`` payload for requesting support pages.

    Args:
        ext_id: Extension id, inserted into the URL-encoded permalink.
        start: Value sent as ``startindex``.
        end: Value sent as ``numresults`` (despite the parameter name, it
            is not sent as an end index).

    Returns:
        The payload string for the ``chrome_webstore_support`` group.
    """
    return (
        'req={ "appId":94,"version":"150922","hl":"en",'
        '"specs":[{"type":"CommentThread",'
        f'"url":"http%3A%2F%2Fchrome.google.com%2Fextensions%2Fpermalink%3Fid%3D{ext_id}",'
        '"groups":"chrome_webstore_support",'
        f'"startindex":"{start}","numresults":"{end}",'
        '"id":"379"}],"internedKeys":[],"internedValues":[]}'
    )
def const_review_payload(ext_id, start, end):
    """Build the ``req=...`` payload for requesting review pages.

    Args:
        ext_id: Extension id, inserted into the URL-encoded permalink.
        start: Value sent as ``startindex``.
        end: Value sent as ``numresults`` (despite the parameter name, it
            is not sent as an end index).

    Returns:
        The payload string for the ``chrome_webstore`` group, sorted by
        ``cws_qscore``.
    """
    return (
        'req={ "appId":94,"version":"150922","hl":"en",'
        '"specs":[{"type":"CommentThread",'
        f'"url":"http%3A%2F%2Fchrome.google.com%2Fextensions%2Fpermalink%3Fid%3D{ext_id}",'
        '"groups":"chrome_webstore","sortby":"cws_qscore",'
        f'"startindex":"{start}","numresults":"{end}",'
        '"id":"428"}],"internedKeys":[],"internedValues":[]}'
    )
2017-07-12 15:10:47 +00:00
def const_review_search_payload(params):
    """Build the ``req=...`` payload for a review search request.

    Args:
        params: Iterable of 5-tuples
            ``(extid, author, start, numresults, groups)``. One search spec
            is generated per tuple; ``groups`` is serialised with
            ``json.dumps`` and ``start``/``numresults`` are inserted as-is.

    Returns:
        The payload string: ``req=`` followed by a JSON object holding all
        search specs, followed by ``&requestSource=widget``.
    """
    permalink = "http://chrome.google.com/extensions/permalink?id="
    spec_template = (
        '{{"requireComment":true,"entities":[{{"annotation":'
        '{{"groups":{groups},"author":{author},'
        '"url":{url}}}}}],'
        '"matchExtraGroups":true,"startIndex":{start},'
        '"numResults":{numresults},'
        '"includeNicknames":true,'
        '"locale": {{"language": "en","country": "us"}}}}'
    )
    specs = [
        spec_template.format(
            groups=json.dumps(groups),
            # author and extid end up inside JSON string literals, so they
            # must be escaped: a raw quote or backslash would otherwise
            # produce malformed JSON. ensure_ascii=False keeps the output
            # byte-identical for values that need no escaping.
            author=json.dumps(str(author), ensure_ascii=False),
            url=json.dumps(permalink + str(extid), ensure_ascii=False),
            start=start,
            numresults=numresults,
        )
        for extid, author, start, numresults, groups in params
    ]
    return ('req={"applicationId":94,"searchSpecs":[' + ",".join(specs) +
            ']}&requestSource=widget')
2017-07-29 09:57:35 +00:00
def get_local_archive_dir(ext_id):
    """Return the archive sub-directory name for an extension.

    The name is made of the first three characters of ``ext_id`` (fewer if
    ``ext_id`` is shorter).
    """
    prefix = ext_id[:3]
    return f"{prefix}"
2017-06-16 22:19:13 +00:00
def archive_file(archivedir, ext_id):
    """Return the path of the archive tar file of an extension.

    The path is ``<archivedir>/<local archive dir>/<ext_id>.tar``, where the
    local archive dir is obtained from ``get_local_archive_dir(ext_id)``.
    ``archivedir`` is converted with ``str`` before joining.
    """
    root = str(archivedir)
    subdir = get_local_archive_dir(ext_id)
    tar_name = ext_id + ".tar"
    return os.path.join(root, subdir, tar_name)
2017-06-20 07:10:28 +00:00
def const_basedir():
    """Return the top-level directory of the extension crawler archive.

    The value of the ``EXTENSION_ARCHIVE`` environment variable is used when
    it is set (even if empty); otherwise the relative path ``"archive"``.
    """
    return os.environ.get("EXTENSION_ARCHIVE", "archive")
def const_parallel_downloads():
    """Return the number of parallel downloads (36)."""
    parallel_downloads = 36
    return parallel_downloads
2017-11-28 23:40:28 +00:00
def const_verbose():
    """Return the default verbosity setting (enabled)."""
    verbose_by_default = True
    return verbose_by_default
2018-04-21 18:00:07 +00:00
def const_use_process_pool():
    """Return whether ProcessPool (module 'pebble') is used for concurrency.

    Disabled by default.
    """
    use_process_pool = False
    return use_process_pool
2017-11-28 23:40:28 +00:00
2018-04-21 18:00:07 +00:00
def const_log_format(ext_id="-" * 32):
    """Return the %-style log format string, tagged with ``ext_id``.

    ``ext_id`` is embedded literally between the level name and the message
    fields; it defaults to a placeholder of 32 dashes.
    """
    return f"%(process)6s %(asctime)s %(levelname)8s {ext_id} %(message)s"
2017-11-28 23:40:28 +00:00
def const_discover():
    """Return the default setting of discovery mode (disabled)."""
    discover_by_default = False
    return discover_by_default
2017-08-27 17:28:19 +00:00
2018-04-05 16:32:11 +00:00
def const_ext_timeout():
    """Return the timeout for downloading one extension: 2 hours, in seconds."""
    hours = 2
    seconds_per_hour = 60 * 60
    return hours * seconds_per_hour
2017-11-28 23:40:28 +00:00
2017-08-27 17:28:19 +00:00
def const_mysql_config_file():
    """Return the path ``~/.my.cnf`` with ``~`` expanded to the user's home."""
    config_path = "~/.my.cnf"
    return os.path.expanduser(config_path)
2017-11-28 23:40:28 +00:00
def const_mysql_maxtries():
    """Return the MySQL try limit (12).

    NOTE(review): presumably the maximum number of attempts for a MySQL
    operation before giving up -- confirm against the callers.
    """
    max_tries = 12
    return max_tries
2017-11-28 23:40:28 +00:00
def const_mysql_try_wait():
    """Return the wait between MySQL tries (300).

    NOTE(review): the unit is presumably seconds -- confirm against the
    callers.
    """
    try_wait = 300
    return try_wait