#!/usr/bin/env python3
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
"""Helpers for maintaining the on-disk archive of crawled extensions."""

import glob
import os

# NOTE(review): the original header also imported sys, re, requests,
# time.sleep, random.randint, datetime and `from ExtensionCrawler.util
# import *`; none of those names are used anywhere in this module, so the
# unused imports were dropped.  dateutil is imported lazily below so the
# module stays importable when python-dateutil is absent.


def get_local_archive_dir(id):
    """Return the archive subdirectory for extension *id*, e.g. "abc/abcdef...".

    Extensions are sharded by the first three characters of their id so no
    single directory grows unboundedly.  (The parameter shadows the builtin
    ``id``; the name is kept for interface compatibility with callers.)
    """
    return "{}/{}".format(id[:3], id)


def get_local_archive_dirs(id):
    """Return every archive directory for *id* (currently exactly one)."""
    return [get_local_archive_dir(id)]


def write_text(dir, fname, text):
    """Write *text* to the file *fname* inside directory *dir*."""
    with open(os.path.join(dir, fname), 'w') as f:
        f.write(text)


def store_request_metadata(dir, fname, request):
    """Persist headers, status code and final URL of a `requests` response."""
    write_text(dir, fname + ".headers", str(request.headers))
    write_text(dir, fname + ".status", str(request.status_code))
    write_text(dir, fname + ".url", str(request.url))


def store_request_text(dir, fname, request):
    """Persist the body of a `requests` response together with its metadata."""
    write_text(dir, fname, request.text)
    store_request_metadata(dir, fname, request)


def httpdate(dt):
    """Format the datetime *dt* as an RFC 1123 (HTTP) date string in GMT.

    The day/month names are spelled out by hand so the result is locale
    independent, as HTTP requires.
    """
    weekday = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()]
    month = [
        "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct",
        "Nov", "Dec"
    ][dt.month - 1]
    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
        weekday, dt.day, month, dt.year, dt.hour, dt.minute, dt.second)


def last_modified_utc_date(path):
    """Return the timestamp encoded in the parent directory name of *path*.

    Archive entries live in a directory named after their fetch date; this
    extracts that component.  Returns "" for an empty path.
    """
    # BUG FIX: the original tested `path is ""` — identity, not equality —
    # which is implementation-dependent and a SyntaxWarning on CPython >= 3.8.
    if not path:
        return ""
    return os.path.split(os.path.dirname(path))[1]


def last_modified_http_date(path):
    """Like last_modified_utc_date, but formatted as an HTTP date string."""
    # BUG FIX: same `is ""` identity comparison as above.
    if not path:
        return ""
    import dateutil.parser  # local import: python-dateutil only needed here
    return httpdate(dateutil.parser.parse(last_modified_utc_date(path)))


def last_crx(dir, extid):
    """Return the newest archived *.crx below the parent of *dir*, or "".

    *extid* is currently unused but kept for interface compatibility.
    """
    old_archives = sorted(
        glob.glob(os.path.join(os.path.dirname(dir), "*/*.crx")))
    # sorted() puts the lexicographically greatest (newest timestamped)
    # archive last; an empty result yields "".
    return old_archives[-1] if old_archives else ""
def const_sitemap_url():
    """URL of the Chrome Web Store sitemap index."""
    return "https://chrome.google.com/webstore/sitemap"


def const_sitemap_scheme():
    """XML namespace used by the sitemap protocol."""
    return "http://www.sitemaps.org/schemas/sitemap/0.9"


def const_overview_url(id):
    """Web-store overview (detail) page for extension *id*."""
    return 'https://chrome.google.com/webstore/detail/{}'.format(id)


def const_store_url():
    """Web-store landing page."""
    return 'https://chrome.google.com/webstore'


def const_review_url():
    """Endpoint serving review comment threads."""
    return 'https://chrome.google.com/reviews/components'


def const_support_url():
    """Endpoint serving support comment threads (same host as reviews)."""
    return 'https://chrome.google.com/reviews/components'


def const_download_url():
    """CRX download URL template; the caller formats in the extension id."""
    return ('https://clients2.google.com/service/update2/crx'
            '?response=redirect&nacl_arch=x86-64&prodversion=9999.0.9999.0'
            '&x=id%3D{}%26uc')


def const_categories():
    """Web-store category slugs visited by the discovery crawl."""
    return [
        'extensions',
        'ext/22-accessibility',
        'ext/10-blogging',
        'ext/15-by-google',
        'ext/11-web-development',
        'ext/14-fun',
        'ext/6-news',
        'ext/28-photos',
        'ext/7-productivity',
        'ext/38-search-tools',
        'ext/12-shopping',
        'ext/1-communication',
        'ext/13-sports',
    ]


def const_support_payload(ext_id, start, end):
    """Form payload requesting support threads for *ext_id*.

    *start* is the first comment index, *end* the number of results.
    Doubled braces escape literal { } in str.format.
    """
    template = ('req={{ "appId":94,"version":"150922","hl":"en",'
                '"specs":[{{"type":"CommentThread",'
                '"url":"http%3A%2F%2Fchrome.google.com%2Fextensions'
                '%2Fpermalink%3Fid%3D{}",'
                '"groups":"chrome_webstore_support",'
                '"startindex":"{}","numresults":"{}","id":"379"}}],'
                '"internedKeys":[],"internedValues":[]}}')
    return template.format(ext_id, start, end)


def const_review_payload(ext_id, start, end):
    """Form payload requesting review threads for *ext_id*.

    Same shape as const_support_payload but a different comment group,
    sort order and spec id.
    """
    template = ('req={{ "appId":94,"version":"150922","hl":"en",'
                '"specs":[{{"type":"CommentThread",'
                '"url":"http%3A%2F%2Fchrome.google.com%2Fextensions'
                '%2Fpermalink%3Fid%3D{}",'
                '"groups":"chrome_webstore","sortby":"cws_qscore",'
                '"startindex":"{}","numresults":"{}","id":"428"}}],'
                '"internedKeys":[],"internedValues":[]}}')
    return template.format(ext_id, start, end)
os.path.split(os.path.dirname(path))[1] - - -def last_modified_http_date(path): - if path is "": - return "" - return httpdate(dateutil.parser.parse(last_modified_utc_date(path))) - - -def last_crx(dir, extid): - old_archives = sorted( - glob.glob(os.path.join(os.path.dirname(dir), "*/*.crx"))) - last_archive = "" - if old_archives != []: - last_archive = old_archives[-1] - return last_archive - - def validate_crx_response(res, extfilename): regex_extfilename = re.compile(r'^extension[_0-9]+\.crx$') if not 'Content-Type' in res.headers: