From 3cdeba20b4cf09550905b513b51216790dcc2177 Mon Sep 17 00:00:00 2001 From: "Achim D. Brucker" Date: Sat, 28 Jan 2017 13:15:05 +0000 Subject: [PATCH] Reformatting. --- ExtensionCrawler/archive.py | 57 +++++++++++++++--------------------- ExtensionCrawler/discover.py | 2 +- ExtensionCrawler/util.py | 4 ++- crawler | 6 ++-- 4 files changed, 31 insertions(+), 38 deletions(-) mode change 100755 => 100644 ExtensionCrawler/archive.py diff --git a/ExtensionCrawler/archive.py b/ExtensionCrawler/archive.py old mode 100755 new mode 100644 index aa855b5..245831e --- a/ExtensionCrawler/archive.py +++ b/ExtensionCrawler/archive.py @@ -49,22 +49,22 @@ class RequestResult: self.exception = exception def is_ok(self): - return (self.exception is None) and (self.http_status==200) + return (self.exception is None) and (self.http_status == 200) def not_authorized(self): - return (self.exception is None) and (self.http_status==401) + return (self.exception is None) and (self.http_status == 401) def not_found(self): - return (self.exception is None) and (self.http_status==404) + return (self.exception is None) and (self.http_status == 404) def has_exception(self): return self.exception is not None def not_available(self): - return (self.exception is None) and (self.http_status==503) + return (self.exception is None) and (self.http_status == 503) def not_modified(self): - return ((self.exception is None) and (self.http_status==304)) + return ((self.exception is None) and (self.http_status == 304)) class UpdateResult: @@ -76,9 +76,10 @@ class UpdateResult: self.res_support = res_support def is_ok(self): - return (self.res_overview.is_ok() and (self.res_crx.is_ok() or self.res_crx.not_modified()) and ( - (self.res_reviews is None) or self.res_reviews.is_ok()) and ( - (self.res_support is None) or self.res_support.is_ok())) + return (self.res_overview.is_ok() and + (self.res_crx.is_ok() or self.res_crx.not_modified()) and + ((self.res_reviews is None) or self.res_reviews.is_ok()) and ( + (self.res_support is None) or self.res_support.is_ok())) def not_authorized(self): return (self.res_overview.not_authorized() or @@ -112,13 +113,14 @@ class UpdateResult: return self.res_crx.not_modified() - def get_local_archive_dir(id): - return "{}/{}".format(id[:3],id) + return "{}/{}".format(id[:3], id) + def get_local_archive_dirs(id): return [get_local_archive_dir(id)] + def write_text(dir, fname, text): with open(os.path.join(dir, fname), 'w') as f: f.write(text) @@ -134,6 +136,7 @@ def store_request_text(dir, fname, request): write_text(dir, fname, request.text) store_request_metadata(dir, fname, request) + def httpdate(dt): weekday = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()] month = [ @@ -154,6 +157,8 @@ def last_modified_http_date(path): if path is "": return "" return httpdate(dateutil.parser.parse(last_modified_utc_date(path))) + + def last_crx(dir, extid): old_archives = sorted( glob.glob(os.path.join(os.path.dirname(dir), "*/*.crx"))) @@ -163,10 +168,6 @@ def last_crx(dir, extid): return last_archive - - - - def update_overview(dir, verbose, ext_id): log(verbose, " * overview page: ") try: @@ -207,10 +208,9 @@ def update_crx(dir, verbose, ext_id): if last_crx_file is not "": headers = {'If-Modified-Since': last_crx_http_date} try: - res = requests.get( - const_download_url().format(ext_id), - stream=True, - headers=headers) + res = requests.get(const_download_url().format(ext_id), + stream=True, + headers=headers) log(verbose, "{}".format(str(res.status_code))) extfilename = os.path.basename(res.url) store_request_metadata(dir, extfilename, res) @@ -240,16 +240,12 @@ def update_reviews(dir, verbose, ext_id): try: google_dos_protection() res = requests.post( - const_review_url(), - data=const_review_payload(ext_id, "0", - "100")) + const_review_url(), data=const_review_payload(ext_id, "0", "100")) log(verbose, "{}/".format(str(res.status_code))) store_request_text(dir, 'reviews000-099.text', res) google_dos_protection() res = requests.post( - const_review_url(), - data=const_review_payload(ext_id, "0", - "100")) + const_review_url(), data=const_review_payload(ext_id, "0", "100")) log(verbose, "{}".format(str(res.status_code))) store_request_text(dir, 'reviews100-199.text', res) except Exception as e: @@ -267,15 +263,13 @@ def update_support(dir, verbose, ext_id): google_dos_protection() res = requests.post( const_support_url(), - data=const_support_payload(ext_id, "0", - "100")) + data=const_support_payload(ext_id, "0", "100")) log(verbose, "{}/".format(str(res.status_code))) store_request_text(dir, 'support000-099.text', res) google_dos_protection() res = requests.post( const_support_url(), - data=const_support_payload(ext_id, "100", - "100")) + data=const_support_payload(ext_id, "100", "100")) log(verbose, "{}".format(str(res.status_code))) store_request_text(dir, 'support100-199.text', res) except Exception as e: @@ -293,9 +287,7 @@ def update_extension(archivedir, verbose, forums, ext_id): log(verbose, "\n") date = datetime.now(timezone.utc).isoformat() dir = os.path.join( - os.path.join(archivedir, - get_local_archive_dir(ext_id)), - date) + os.path.join(archivedir, get_local_archive_dir(ext_id)), date) os.makedirs(dir, exist_ok=True) res_overview = update_overview(dir, verbose, ext_id) res_crx = update_crx(dir, verbose, ext_id) @@ -335,6 +327,3 @@ def get_forum_ext_ids(confdir, verbose): ids = f.readlines() ids = [x.strip() for x in ids] return ids - - - diff --git a/ExtensionCrawler/discover.py b/ExtensionCrawler/discover.py index bdd7fc0..0d0ae57 100644 --- a/ExtensionCrawler/discover.py +++ b/ExtensionCrawler/discover.py @@ -48,6 +48,7 @@ def crawl_nearly_all_of_ext_ids(): map(lambda s: [elem.text for elem in get_inner_elems(s)], shards), []) return [re.search("[a-z]{32}", url).group(0) for url in overview_urls] + def get_new_ids(verbose, known_ids): log(verbose, "Discovering new ids ... \n") discovered_ids = ExtensionCrawler.discover.crawl_nearly_all_of_ext_ids() @@ -55,4 +56,3 @@ def get_new_ids(verbose, known_ids): log(verbose, " Discovered {} new extensions (out of {})\n".format( len(new_ids), len(discovered_ids))) return new_ids - diff --git a/ExtensionCrawler/util.py b/ExtensionCrawler/util.py index 4c7cf20..62da899 100644 --- a/ExtensionCrawler/util.py +++ b/ExtensionCrawler/util.py @@ -16,19 +16,21 @@ # along with this program. If not, see . # - import sys from time import sleep from random import randint from datetime import datetime, timezone + def google_dos_protection(max=3): sleep(randint(1, max) * .5) + def log(verbose, msg): if verbose: sys.stdout.write(msg) + def valueOf(value, default): if value is not None and value is not "": return value diff --git a/crawler b/crawler index dd02c67..35fabc9 100755 --- a/crawler +++ b/crawler @@ -44,11 +44,13 @@ def log_summary(verbose, res): log(verbose, "Summary:\n") log(verbose, " Updated {} out of {} extensions successfully\n".format( str(success), str(total))) - log(verbose, " Not authorized: {}\n".format(str(not_authorized))) + log(verbose, + " Not authorized: {}\n".format(str(not_authorized))) log(verbose, " Raised Google DDOS: {}\n".format(str(raised_ddos))) log(verbose, " Not modified archives: {}\n".format(str(not_modified))) log(verbose, " Extensions not in store: {}\n".format(str(not_in_store))) - log(verbose, " Unknown exception: {}\n".format(str(has_exception))) + log(verbose, + " Unknown exception: {}\n".format(str(has_exception))) def main():