From c08124fa1773cfc0137ab43b9db0a4680e1113a4 Mon Sep 17 00:00:00 2001
From: Michael Herzberg
Date: Fri, 16 Jun 2017 14:56:23 +0100
Subject: [PATCH 01/16] First version of sqlite generator.

---
 ExtensionCrawler/sqlite.py | 147 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 143 insertions(+), 4 deletions(-)

diff --git a/ExtensionCrawler/sqlite.py b/ExtensionCrawler/sqlite.py
index 01786da..af14683 100644
--- a/ExtensionCrawler/sqlite.py
+++ b/ExtensionCrawler/sqlite.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 #
 # Copyright (C) 2017 The University of Sheffield, UK
-# 
+#
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
@@ -18,11 +18,150 @@
 from ExtensionCrawler.config import *
 from ExtensionCrawler.util import *
+from ExtensionCrawler.crx import *
+
+
+from pathlib import Path
+import sqlite3
+import re
+from bs4 import BeautifulSoup
+from zipfile import ZipFile
+import json
+
+
+def setup_tables(con):
+    # TODO: delete old db if schemas don't match
+    con.execute("""CREATE TABLE IF NOT EXISTS review ("""
+                """id INTEGER PRIMARY KEY,"""
+                """extid TEXT,"""
+                """date TEXT,"""
+                """user TEXT,"""
+                """reviewdate TEXT,"""
+                """rating TEXT,"""
+                """comment TEXT"""
+                """)""")
+    con.execute("""CREATE TABLE IF NOT EXISTS category ("""
+                """extid TEXT,"""
+                """date TEXT,"""
+                """category TEXT,"""
+                """PRIMARY KEY (extid, date, category)"""
+                """)""")
+    con.execute("""CREATE TABLE IF NOT EXISTS permission ("""
+                """crx_etag TEXT,"""
+                """permission TEXT,"""
+                """PRIMARY KEY (crx_etag, permission)"""
+                """)""")
+    con.execute("""CREATE TABLE IF NOT EXISTS crx ("""
+                """etag TEXT PRIMARY KEY,"""
+                """filename TEXT,"""
+                """publickey BLOB"""
+                """)""")
+    con.execute("""CREATE TABLE IF NOT EXISTS extension ("""
+                """extid TEXT,"""
+                """date TEXT,"""
+                """name TEXT,"""
+                """version TEXT,"""
+                """description TEXT,"""
+                """downloads INTEGER,"""
+                """fulldescription TEXT,"""
+                """developer TEXT,"""
+                """crx_etag TEXT,"""
+                """PRIMARY KEY (extid, date),"""
+                """FOREIGN KEY (crx_etag) REFERENCES crx(etag)"""
+                """)""")
+
+def get_etag(date, tmptardir):
+    header_path = list((tmptardir / date).glob("*.crx.headers"))[0]
+
+    with open(header_path) as f:
+        return eval(f.read())["ETag"]
+
+def parse_and_insert_overview(ext_id, date, tmptardir, con):
+    overview_path = tmptardir / date / "overview.html"
+    with open(overview_path) as overview_file:
+        contents = overview_file.read()
+
+    # Extract extension name
+    match = re.search("""<meta itemprop="name" content="(.*?)"/>""", contents)
+    name = match.group(1) if match else None
+
+    # Extract extension version
+    match = re.search("""<meta itemprop="version" content="(.*?)"/>""", contents)
+    version = match.group(1) if match else None
+
+    # Extract the short extension description as it appears on the overview page
+    match = re.search("""(.+?)""", contents)
+    description = match.group(1) if match else None
+
+    # Extracts extension categories
+    match = re.search("""Attribute name="category">(.+?)</Attribute>""", contents)
+    categories = match.group(1).split(",") if match else None
+
+    # Extracts the number of downloads
+    match = re.search("""user_count.*?(\d+)""", contents)
+    downloads = int(match.group(1)) if match else None
+
+    # Extracts the full extension description as it appears on the overview page
+    doc = BeautifulSoup(contents, 'html.parser')
+
+    desc = doc.find('div', itemprop="description")
+    full_description = desc.parent if desc and desc.parent else None
+
+    developer = doc.find(class_=lambda cls: cls and "e-f-Me" in cls)
+
+    etag = get_etag(date, tmptardir)
+
+    con.execute("INSERT INTO extension VALUES (?,?,?,?,?,?,?,?,?)",
+                (ext_id, date,
name, version, description, downloads, + str(full_description), str(developer), etag)) + + for category in categories: + con.execute("INSERT INTO category VALUES (?,?,?)", + (ext_id, date, category)) + + +def parse_and_insert_crx(ext_id, date, tmptardir, con): + crx_path = list((tmptardir / date).glob("*.crx"))[0] + filename = crx_path.name + + etag = get_etag(date, tmptardir) + + with ZipFile(str(crx_path)) as f: + with f.open("manifest.json") as m: + try: + # There are some manifests that seem to have weird encodings... + manifest = json.loads(m.read().decode("utf-8-sig")) + if "permissions" in manifest: + for permission in manifest["permissions"]: + con.execute("INSERT INTO permission VALUES (?,?)", + (etag, str(permission))) + except json.decoder.JSONDecodeError: + pass + + public_key = read_crx(str(crx_path)).pk + + con.execute("INSERT INTO crx VALUES (?,?,?)", (etag, filename, public_key)) def update_sqlite(archivedir, tmptardir, verbose, ext_id, date): - indent = " " - txt = logmsg(verbose, "", indent + "* Updating SQLite ...") - txt = logmsg(verbose, txt, "") + tmptardir = Path(tmptardir) + indent = 11 * " " + + txt = "" + + db_path = Path(archivedir) / ext_id[:3] / (ext_id + ".sqlite") + txt = logmsg(verbose, txt, indent + 4 * " " + "- using db file {}\n".format(str(db_path))) + + with sqlite3.connect(str(db_path)) as con: + setup_tables(con) + parse_and_insert_overview(ext_id, date, tmptardir, con) + + etag = get_etag(date, tmptardir) + etag_already_in_db = list(con.execute("SELECT COUNT(etag) FROM crx WHERE etag=?", (etag,)))[0][0] + if not etag_already_in_db: + txt = logmsg(verbose, txt, indent + 4 * " " + "- etag not found in db, parsing crx...") + parse_and_insert_crx(ext_id, date, tmptardir, con) + + #TODO: add reviews return txt From 6e2772711f99c581f4f9c48b9b0e5cc727f0c0b1 Mon Sep 17 00:00:00 2001 From: Michael Herzberg Date: Fri, 16 Jun 2017 20:40:48 +0100 Subject: [PATCH 02/16] Next version of sqlite generator. 
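
The incremental update relies on "with sqlite3.connect(...) as con:" blocks. As a
note for reviewers (an illustration of documented standard-library behaviour, not
code from this patch): used as a context manager, a sqlite3 connection commits the
open transaction on success and rolls back on an exception, which is why the code
below never calls con.commit() explicitly. It does not close the connection.

    import sqlite3

    # Commits if the block exits normally, rolls back if it raises;
    # the connection object itself remains open afterwards.
    with sqlite3.connect(":memory:") as con:
        con.execute("CREATE TABLE t (x INTEGER)")
        con.execute("INSERT INTO t VALUES (?)", (42,))
    con.close()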
--- ExtensionCrawler/archive.py | 19 +++- ExtensionCrawler/sqlite.py | 203 +++++++++++++++++++++++++----------- 2 files changed, 156 insertions(+), 66 deletions(-) diff --git a/ExtensionCrawler/archive.py b/ExtensionCrawler/archive.py index 3db2fc3..559072b 100644 --- a/ExtensionCrawler/archive.py +++ b/ExtensionCrawler/archive.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # # Copyright (C) 2016,2017 The University of Sheffield, UK -# +# # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or @@ -262,6 +262,7 @@ def update_crx(archivedir, tmptardir, verbose, ext_id, date): timeout=10, allow_redirects=True) etag = res.headers.get('Etag') + write_text(tmptardir, date, extfilename + ".etag", etag) logtxt = logmsg(verbose, logtxt, ( " - checking etag, last: {}\n" + " current: {}\n").format( @@ -287,6 +288,7 @@ def update_crx(archivedir, tmptardir, verbose, ext_id, date): for chunk in res.iter_content(chunk_size=512 * 1024): if chunk: # filter out keep-alive new chunks f.write(chunk) + write_text(tmptardir, date, extfilename + ".etag", res.headers.get("ETag")) except Exception as e: raise e logtxt = logmsg(verbose, logtxt, @@ -444,9 +446,16 @@ def update_extension(archivedir, verbose, forums, ext_id): except Exception: pass - msg_updatesqlite = update_sqlite(archivedir, tmptardir, verbose, ext_id, - date) - log(verbose, logtxt + msg_updatesqlite) + try: + msg_updatesqlite = update_sqlite(archivedir, tmptardir, ext_id, date, + verbose, 11 * " ") + logtxt = logmsg(verbose, logtxt, msg_updatesqlite) + except Exception as e: + logtxt = logmsg(verbose, logtxt, " * Eventually failed create sqlite files") + logtxt = logmsg(verbose, logtxt, " / Exception: {}\n".format(str(e))) + + tar_exception = e + try: shutil.rmtree(path=tmpdir) @@ -472,7 +481,7 @@ def update_extensions(archivedir, verbose, forums_ext_ids, ext_ids): log(verbose, "Updating {} extensions ({} including forums)\n".format( len(ext_ids), len(forums_ext_ids))) # First, update extensions with forums sequentially (and with delays) to - # avoid running into Googles DDOS detection. + # avoid running into Googles DDOS detection. 
log(verbose,
     "  Updating {} extensions including forums (sequentially))\n".format(
         len(forums_ext_ids)))
 
diff --git a/ExtensionCrawler/sqlite.py b/ExtensionCrawler/sqlite.py
index af14683..7f5acca 100644
--- a/ExtensionCrawler/sqlite.py
+++ b/ExtensionCrawler/sqlite.py
@@ -19,6 +19,7 @@
 from ExtensionCrawler.config import *
 from ExtensionCrawler.util import *
 from ExtensionCrawler.crx import *
+from ExtensionCrawler.archive import *
 
 from pathlib import Path
 import sqlite3
@@ -27,11 +28,17 @@
 import re
 from bs4 import BeautifulSoup
 from zipfile import ZipFile
 import json
+import os
+import tempfile
+import tarfile
 
+class IncrementalSqliteUpdateError(Exception):
+    def __init__(self, reason="unknown"):
+        self.reason = reason
 
 def setup_tables(con):
     # TODO: delete old db if schemas don't match
-    con.execute("""CREATE TABLE IF NOT EXISTS review ("""
+    con.execute("""CREATE TABLE review ("""
@@ -40,23 +47,23 @@ def setup_tables(con):
                 """rating TEXT,"""
                 """comment TEXT"""
                 """)""")
-    con.execute("""CREATE TABLE IF NOT EXISTS category ("""
+    con.execute("""CREATE TABLE category ("""
                 """extid TEXT,"""
                 """date TEXT,"""
                 """category TEXT,"""
                 """PRIMARY KEY (extid, date, category)"""
                 """)""")
-    con.execute("""CREATE TABLE IF NOT EXISTS permission ("""
+    con.execute("""CREATE TABLE permission ("""
                 """crx_etag TEXT,"""
                 """permission TEXT,"""
                 """PRIMARY KEY (crx_etag, permission)"""
                 """)""")
-    con.execute("""CREATE TABLE IF NOT EXISTS crx ("""
+    con.execute("""CREATE TABLE crx ("""
                 """etag TEXT PRIMARY KEY,"""
                 """filename TEXT,"""
                 """publickey BLOB"""
                 """)""")
-    con.execute("""CREATE TABLE IF NOT EXISTS extension ("""
+    con.execute("""CREATE TABLE extension ("""
                 """extid TEXT,"""
                 """date TEXT,"""
                 """name TEXT,"""
@@ -66,18 +73,31 @@
                 """fulldescription TEXT,"""
                 """developer TEXT,"""
                 """crx_etag TEXT,"""
+                """crx_status INTEGER,"""
+                """overview_status INTEGER,"""
+                """lastupdated TEXT,"""
                 """PRIMARY KEY (extid, date),"""
                 """FOREIGN KEY (crx_etag) REFERENCES crx(etag)"""
                 """)""")
 
-def get_etag(date, tmptardir):
-    header_path = list((tmptardir / date).glob("*.crx.headers"))[0]
 
-    with open(header_path) as f:
-        return eval(f.read())["ETag"]
+def get_etag(datepath):
+    etagpath = next(datepath.glob("*.etag"), None)
 
-def parse_and_insert_overview(ext_id, date, tmptardir, con):
-    overview_path = tmptardir / date / "overview.html"
+    if etagpath:
+        with open(etagpath) as f:
+            return f.read()
+
+def get_overview_status(datepath):
+    with open(datepath / "overview.html.status") as f:
+        return int(f.read())
+
+def get_crx_status(datepath):
+    with open(next(datepath.glob("*.crx.status"))) as f:
+        return int(f.read())
+
+def parse_and_insert_overview(ext_id, date, datepath, con):
+    overview_path = datepath / "overview.html"
     with open(overview_path) as overview_file:
         contents = overview_file.read()
 
@@ -89,10 +109,6 @@ def parse_and_insert_overview(ext_id, date, tmptardir, con):
     match = re.search("""<meta itemprop="version" content="(.*?)"/>""", contents)
     version = match.group(1) if match else None
 
-    # Extract the short extension description as it appears on the overview page
-    match = re.search("""(.+?)""", contents)
-    description = match.group(1) if match else None
-
     # Extracts extension categories
     match = re.search("""Attribute name="category">(.+?)</Attribute>""", contents)
     categories = match.group(1).split(",") if match else None
 
@@ -104,64 +120,129 @@
     # Extracts the full extension description as it appears on the overview page
     doc = BeautifulSoup(contents, 'html.parser')
 
-    desc = doc.find('div', itemprop="description")
-    full_description = desc.parent if desc and desc.parent else None
+
description_parent = doc.find('div', itemprop="description") + description = str(description_parent.contents[0]) if description_parent and description_parent.contents else None + full_description = str(description_parent.parent) if description_parent else None - developer = doc.find(class_=lambda cls: cls and "e-f-Me" in cls) + developer_parent = doc.find(class_=lambda cls: cls and "e-f-Me" in cls) + developer = str(developer_parent.contents[0]) if developer_parent else None - etag = get_etag(date, tmptardir) + last_updated_parent = doc.find(class_=lambda cls: cls and "h-C-b-p-D-xh-hh" in cls) + last_updated = str(last_updated_parent.contents[0]) if last_updated_parent else None - con.execute("INSERT INTO extension VALUES (?,?,?,?,?,?,?,?,?)", + etag = get_etag(datepath) + + overview_status = get_overview_status(datepath) + + crx_status = get_crx_status(datepath) + + + con.execute("INSERT INTO extension VALUES (?,?,?,?,?,?,?,?,?,?,?,?)", (ext_id, date, name, version, description, downloads, - str(full_description), str(developer), etag)) + full_description, developer, etag, last_updated, overview_status, crx_status)) - for category in categories: - con.execute("INSERT INTO category VALUES (?,?,?)", - (ext_id, date, category)) + if categories: + for category in categories: + con.execute("INSERT INTO category VALUES (?,?,?)", + (ext_id, date, category)) -def parse_and_insert_crx(ext_id, date, tmptardir, con): - crx_path = list((tmptardir / date).glob("*.crx"))[0] - filename = crx_path.name - - etag = get_etag(date, tmptardir) - - with ZipFile(str(crx_path)) as f: - with f.open("manifest.json") as m: - try: - # There are some manifests that seem to have weird encodings... - manifest = json.loads(m.read().decode("utf-8-sig")) - if "permissions" in manifest: - for permission in manifest["permissions"]: - con.execute("INSERT INTO permission VALUES (?,?)", - (etag, str(permission))) - except json.decoder.JSONDecodeError: - pass - - public_key = read_crx(str(crx_path)).pk - - con.execute("INSERT INTO crx VALUES (?,?,?)", (etag, filename, public_key)) - - -def update_sqlite(archivedir, tmptardir, verbose, ext_id, date): - tmptardir = Path(tmptardir) - indent = 11 * " " - +def parse_and_insert_crx(ext_id, date, datepath, con, verbose, indent): txt = "" - db_path = Path(archivedir) / ext_id[:3] / (ext_id + ".sqlite") - txt = logmsg(verbose, txt, indent + 4 * " " + "- using db file {}\n".format(str(db_path))) + etag = get_etag(datepath) + crx_path = next(datepath.glob("*.crx"), None) + filename = crx_path.name + + try: + with ZipFile(str(crx_path)) as f: + with f.open("manifest.json") as m: + try: + # There are some manifests that seem to have weird encodings... 
+ manifest = json.loads(m.read().decode("utf-8-sig")) + if "permissions" in manifest: + for permission in manifest["permissions"]: + con.execute("INSERT OR REPLACE INTO permission VALUES (?,?)", + (etag, str(permission))) + except json.decoder.JSONDecodeError: + pass + + public_key = read_crx(str(crx_path)).pk + + con.execute("INSERT INTO crx VALUES (?,?,?)", (etag, filename, public_key)) + except zipfile.BadZipFile as e: + txt = logmsg(verbose, txt, indent + "- {} is not a zip file\n" + .format(crx_path)) + return txt + + +def update_sqlite_incremental(db_path, datepath, ext_id, date, verbose, indent): + txt = "" + + txt = logmsg(verbose, txt, indent + "- updating using {}\n".format(datepath)) + + if not db_path.exists(): + raise IncrementalSqliteUpdateError("db file not found") with sqlite3.connect(str(db_path)) as con: - setup_tables(con) - parse_and_insert_overview(ext_id, date, tmptardir, con) + parse_and_insert_overview(ext_id, date, datepath, con) - etag = get_etag(date, tmptardir) - etag_already_in_db = list(con.execute("SELECT COUNT(etag) FROM crx WHERE etag=?", (etag,)))[0][0] - if not etag_already_in_db: - txt = logmsg(verbose, txt, indent + 4 * " " + "- etag not found in db, parsing crx...") - parse_and_insert_crx(ext_id, date, tmptardir, con) + crx_path = next(datepath.glob("*.crx"), None) - #TODO: add reviews + etag = get_etag(datepath) + etag_already_in_db = next(con.execute("SELECT COUNT(etag) FROM crx WHERE etag=?", (etag,)))[0] + if etag and not etag_already_in_db: + if crx_path: + parse_and_insert_crx(ext_id, date, datepath, con, verbose, indent) + else: + raise IncrementalSqliteUpdateError("etag not in db and no crx file present") + + return txt + + +def update_sqlite_full(db_path, archivedir, ext_id, verbose, indent): + txt = "" + + if db_path.exists(): + os.remove(db_path) + + with tempfile.TemporaryDirectory() as tmpdir: + with tarfile.open(archivedir / ext_id[:3] / (ext_id + ".tar")) as t: + t.extractall(tmpdir) + iddir = Path(tmpdir) / ext_id + + with sqlite3.connect(str(db_path)) as con: + setup_tables(con) + for datepath in sorted(iddir.iterdir()): + date = datepath.name + updatetxt = update_sqlite_incremental(db_path, datepath, ext_id, date, verbose, indent) + txt = logmsg(verbose, txt, updatetxt) + + return txt + +def update_sqlite(archivedir, tmptardir, ext_id, date, verbose, indent): + txt = "" + + datepath = Path(tmptardir) / date + archivedir = Path(archivedir) + indent2 = indent + 4 * " " + + + txt = logmsg(verbose, txt, indent + "* extracting information into SQLite db...\n") + + db_path = Path(archivedir) / ext_id[:3] / (ext_id + ".sqlite") + + try: + txt = logmsg(verbose, txt, indent2 + "- attempting incremental update...\n") + updatetxt = update_sqlite_incremental(db_path, datepath, ext_id, date, verbose, indent2) + txt = logmsg(verbose, txt, updatetxt) + except IncrementalSqliteUpdateError as e: + txt = logmsg(verbose, txt, indent2 + "- incremental update failed: {}\n".format(e.reason)) + txt = logmsg(verbose, txt, indent2 + "- regenerating full db...\n") + try: + fullmsg = update_sqlite_full(db_path, archivedir, ext_id, verbose, indent2) + txt = logmsg(verbose, txt, fullmsg) + except IncrementalSqliteUpdateError as e: + txt = logmsg(verbose, txt, indent2 + "- full sqlite update failed: {}, giving up\n".format(e.reason)) return txt From 9f174f6785ccc39cbc822e2e91ff413238ede80f Mon Sep 17 00:00:00 2001 From: "Achim D. Brucker" Date: Fri, 16 Jun 2017 22:38:48 +0100 Subject: [PATCH 03/16] Downport to python 3.5. 
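
Background for this downport (a minimal illustration, assuming any existing file
path; not code from this patch): pathlib objects are only accepted by open(),
tarfile.open() and friends from Python 3.6 on (PEP 519, os.fspath). On Python 3.5,
passing a Path raises TypeError, hence the str(...) wrappers introduced below.

    from pathlib import Path

    p = Path("overview.html")
    # Works on Python >= 3.6 only: open(p)
    # Portable to Python 3.5, as done throughout this patch:
    with open(str(p)) as f:
        contents = f.read()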
---
 ExtensionCrawler/sqlite.py | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/ExtensionCrawler/sqlite.py b/ExtensionCrawler/sqlite.py
index 7f5acca..cd5d97d 100644
--- a/ExtensionCrawler/sqlite.py
+++ b/ExtensionCrawler/sqlite.py
@@ -32,6 +32,17 @@ import os
 import tempfile
 import tarfile
 
+
+
+def get_local_archive_dir(id):
+    return "{}".format(id[:3])
+
+def archive_file(archivedir,ext_id):
+    return os.path.join(str(archivedir), get_local_archive_dir(ext_id),
+                        ext_id + ".tar")
+
+
+
 class IncrementalSqliteUpdateError(Exception):
     def __init__(self, reason="unknown"):
         self.reason = reason
@@ -85,20 +96,20 @@ def get_etag(datepath):
     etagpath = next(datepath.glob("*.etag"), None)
 
     if etagpath:
-        with open(etagpath) as f:
+        with open(str(etagpath)) as f:
             return f.read()
 
 def get_overview_status(datepath):
-    with open(datepath / "overview.html.status") as f:
+    with open(str(datepath / "overview.html.status")) as f:
         return int(f.read())
 
 def get_crx_status(datepath):
-    with open(next(datepath.glob("*.crx.status"))) as f:
+    with open(str(next(datepath.glob("*.crx.status")))) as f:
         return int(f.read())
 
 def parse_and_insert_overview(ext_id, date, datepath, con):
     overview_path = datepath / "overview.html"
-    with open(overview_path) as overview_file:
+    with open(str(overview_path)) as overview_file:
         contents = overview_file.read()
 
     # Extract extension name
@@ -207,7 +218,8 @@ def update_sqlite_full(db_path, archivedir, ext_id, verbose, indent):
         os.remove(db_path)
 
     with tempfile.TemporaryDirectory() as tmpdir:
-        with tarfile.open(archivedir / ext_id[:3] / (ext_id + ".tar")) as t:
+        tar = archive_file(archivedir,ext_id)
+        with tarfile.open(tar) as t:
             t.extractall(tmpdir)
         iddir = Path(tmpdir) / ext_id
 

From 012c285c84a086616d756f80c6e2eb1cdb0a0b69 Mon Sep 17 00:00:00 2001
From: "Achim D. Brucker"
Date: Fri, 16 Jun 2017 22:46:04 +0100
Subject: [PATCH 04/16] Added assert to ensure Python 3.5.

---
 crawler | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/crawler b/crawler
index 6dbb472..6fac7c6 100755
--- a/crawler
+++ b/crawler
@@ -33,6 +33,8 @@ import dateutil.parser
 import time
 import getopt
 
+# Script should run with python 3.5
+assert sys.version_info >= (3,5) and sys.version_info < (3,6)
 
 def write_log(dir, fname, text):
     os.makedirs(dir, exist_ok=True)

From 5dd424c06d12fb502cb61ac28c5a9863f309c48a Mon Sep 17 00:00:00 2001
From: "Achim D. Brucker"
Date: Fri, 16 Jun 2017 22:48:37 +0100
Subject: [PATCH 05/16] Removed obsolete permstats.py.

---
 permstats.py | 173 ---------------------------------------------------
 1 file changed, 173 deletions(-)
 delete mode 100755 permstats.py

diff --git a/permstats.py b/permstats.py
deleted file mode 100755
index ab3f7cd..0000000
--- a/permstats.py
+++ /dev/null
@@ -1,173 +0,0 @@
-#!/usr/bin/env python3
-#
-# Copyright (C) 2016 The University of Sheffield, UK
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-# - -from zipfile import ZipFile -import argparse -import json -import sys -import os -from jsmin import jsmin -import re - -regex_concrete_url = re.compile(r'^.*://.*[a-z0-9]+\.[a-z]+.*$') - - -class PermissionHandlerPrintNames: - def __init__(self, permname): - self.permname = permname - self.extinfo = {} - - def handle_permission(self, extid, permobj, path): - if self.permname in str(permobj): - with open(os.path.join(path, 'metadata.json')) as f: - metadata = json.load(f) - self.extinfo[extid] = '{} | {} | {}'.format(metadata[1], - metadata[6], path) - - def print_result(self, fileobj, delim): - fileobj.write('Extensions that use permission "{}":\n\n'.format( - self.permname)) - for extid in self.extinfo: - fileobj.write('{}\n'.format(self.extinfo[extid])) - fileobj.write('\n\n') - - -class PermissionHandler: - def __init__(self): - self.permissions = {} - self.extids = set() - - def handle_permission(self, extid, permobj, path): - self.extids.add(extid) - perm = str(permobj) - if not perm in self.permissions: - self.permissions[perm] = 0 - self.permissions[perm] += 1 - - def print_result(self, fileobj, delim): - fileobj.write('Total: {} extensions\n'.format(len(self.extids))) - for perm in sorted( - self.permissions, key=self.permissions.get, reverse=True): - fileobj.write('{}{}{}{}{:.2%}\n'.format( - perm, delim, self.permissions[perm], delim, - float(self.permissions[perm]) / len(self.extids))) - fileobj.write('\n\n') - - -class PermissionHandlerCondensed: - def __init__(self): - self.permissions = {} - self.extids = set() - self.exts_with_concrete_urls = set() - - def handle_permission(self, extid, permobj, path): - self.extids.add(extid) - - perm = str(permobj) - if regex_concrete_url.match(perm): - if extid in self.exts_with_concrete_urls: - return - self.exts_with_concrete_urls.add(extid) - perm = '<<<{}>>>'.format(regex_concrete_url.pattern) - if not perm in self.permissions: - self.permissions[perm] = 0 - self.permissions[perm] += 1 - - def print_result(self, fileobj, delim): - fileobj.write('Condensed. Total: {} extensions\n'.format( - len(self.extids))) - for perm in sorted( - self.permissions, key=self.permissions.get, reverse=True): - fileobj.write('{}{}{}{}{:.2%}\n'.format( - perm, delim, self.permissions[perm], delim, - float(self.permissions[perm]) / len(self.extids))) - fileobj.write('\n\n') - - -class PermissionStatisticGenerator: - def run(category_folder, permhandlers): - for root, dirs, files in os.walk(category_folder): - crxfile = next((f for f in files if f.endswith('.crx')), None) - if crxfile: - extid = os.path.basename(root) - with ZipFile(os.path.join(root, crxfile)) as zipfile: - with zipfile.open('manifest.json') as f: - content = jsmin(f.read().decode()) - - # This is needed to strip weird BOMs ... - first_bracket = content.find('{') - if first_bracket >= 0: - content = content[first_bracket:] - - manifest = json.loads(content) - if 'permissions' in manifest: - for permobj in manifest['permissions']: - for handler in permhandlers: - handler.handle_permission(extid, permobj, - root) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='Prints statistics about the requested permissions of downloaded extensions.' - ) - parser.add_argument( - 'dir', - help='The directory in which the extensions are stored. The directory structure must be {category}/{extid}/*.crx.' 
- ) - parser.add_argument( - '-d', - '--delim', - default='\t', - help='Delimiter used for the statistics output.') - parser.add_argument( - '-o', - '--output', - default=sys.stdout, - type=argparse.FileType('w'), - help='Save the statistics into a file.') - parser.add_argument( - '-p', - '--permission', - help='Prints out all extension names and descriptions that use the given permission.' - ) - parser.add_argument( - '-c', - '--categories', - action='store_true', - help='Print the results for each category separately.') - - args = parser.parse_args() - - category_folders = [args.dir] - if args.categories: - category_folders += [ - os.path.join(args.dir, d) for d in next(os.walk(args.dir))[1] - ] - - for category_folder in category_folders: - args.output.write('Results for category {}:\n\n'.format( - category_folder)) - if args.permission: - handlers = [PermissionHandlerPrintNames(args.permission)] - else: - handlers = [PermissionHandler(), PermissionHandlerCondensed()] - PermissionStatisticGenerator.run(category_folder, handlers) - - for handler in handlers: - handler.print_result(args.output, args.delim) From 1c8d68d49542e75c50fa4e6f77b32fba65eec909 Mon Sep 17 00:00:00 2001 From: "Achim D. Brucker" Date: Fri, 16 Jun 2017 23:09:23 +0100 Subject: [PATCH 06/16] Moved path utility functions into config module. --- ExtensionCrawler/archive.py | 6 +----- ExtensionCrawler/config.py | 8 ++++++++ ExtensionCrawler/sqlite.py | 12 +----------- 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/ExtensionCrawler/archive.py b/ExtensionCrawler/archive.py index 559072b..cdce875 100644 --- a/ExtensionCrawler/archive.py +++ b/ExtensionCrawler/archive.py @@ -26,7 +26,7 @@ from random import randint import datetime from ExtensionCrawler.config import * from ExtensionCrawler.util import * -from ExtensionCrawler.archive import * +from ExtensionCrawler.archive import archive_file from ExtensionCrawler.sqlite import * import dateutil import dateutil.parser @@ -129,10 +129,6 @@ class UpdateResult: return self.exception is not None -def get_local_archive_dir(id): - return "{}".format(id[:3]) - - def write_text(tardir, date, fname, text): dir = os.path.join(tardir, date) os.makedirs(dir, exist_ok=True) diff --git a/ExtensionCrawler/config.py b/ExtensionCrawler/config.py index 278dfec..5140889 100644 --- a/ExtensionCrawler/config.py +++ b/ExtensionCrawler/config.py @@ -16,6 +16,7 @@ # along with this program. If not, see . 
# +import os def const_sitemap_url(): return "https://chrome.google.com/webstore/sitemap" @@ -74,3 +75,10 @@ def const_review_payload(ext_id, start, end): '"startindex":"{}",' + '"numresults":"{}",' + '"id":"428"}}],' + '"internedKeys":[],' + '"internedValues":[]}}').format(ext_id, start, end) + +def get_local_archive_dir(id): + return "{}".format(id[:3]) + +def archive_file(archivedir,ext_id): + return os.path.join(str(archivedir), get_local_archive_dir(ext_id), + ext_id + ".tar") diff --git a/ExtensionCrawler/sqlite.py b/ExtensionCrawler/sqlite.py index cd5d97d..70c7e17 100644 --- a/ExtensionCrawler/sqlite.py +++ b/ExtensionCrawler/sqlite.py @@ -19,8 +19,8 @@ from ExtensionCrawler.config import * from ExtensionCrawler.util import * from ExtensionCrawler.crx import * -from ExtensionCrawler.archive import * +from ExtensionCrawler import archive from pathlib import Path import sqlite3 @@ -33,16 +33,6 @@ import tempfile import tarfile - -def get_local_archive_dir(id): - return "{}".format(id[:3]) - -def archive_file(archivedir,ext_id): - return os.path.join(str(archivedir), get_local_archive_dir(ext_id), - ext_id + ".tar") - - - class IncrementalSqliteUpdateError(Exception): def __init__(self, reason="unknown"): self.reason = reason From 86a608c6a154b070e58b35e05a1ae21d8dd6da5f Mon Sep 17 00:00:00 2001 From: "Achim D. Brucker" Date: Fri, 16 Jun 2017 23:19:13 +0100 Subject: [PATCH 07/16] Re-formatting. --- ExtensionCrawler/archive.py | 9 ++-- ExtensionCrawler/config.py | 9 ++-- ExtensionCrawler/sqlite.py | 90 ++++++++++++++++++++++++------------- crawler | 3 +- 4 files changed, 72 insertions(+), 39 deletions(-) diff --git a/ExtensionCrawler/archive.py b/ExtensionCrawler/archive.py index cdce875..fbb3eb8 100644 --- a/ExtensionCrawler/archive.py +++ b/ExtensionCrawler/archive.py @@ -284,7 +284,8 @@ def update_crx(archivedir, tmptardir, verbose, ext_id, date): for chunk in res.iter_content(chunk_size=512 * 1024): if chunk: # filter out keep-alive new chunks f.write(chunk) - write_text(tmptardir, date, extfilename + ".etag", res.headers.get("ETag")) + write_text(tmptardir, date, extfilename + ".etag", + res.headers.get("ETag")) except Exception as e: raise e logtxt = logmsg(verbose, logtxt, @@ -444,15 +445,15 @@ def update_extension(archivedir, verbose, forums, ext_id): try: msg_updatesqlite = update_sqlite(archivedir, tmptardir, ext_id, date, - verbose, 11 * " ") + verbose, 11 * " ") logtxt = logmsg(verbose, logtxt, msg_updatesqlite) except Exception as e: - logtxt = logmsg(verbose, logtxt, " * Eventually failed create sqlite files") + logtxt = logmsg(verbose, logtxt, + " * Eventually failed create sqlite files") logtxt = logmsg(verbose, logtxt, " / Exception: {}\n".format(str(e))) tar_exception = e - try: shutil.rmtree(path=tmpdir) except Exception as e: diff --git a/ExtensionCrawler/config.py b/ExtensionCrawler/config.py index 5140889..d65f70d 100644 --- a/ExtensionCrawler/config.py +++ b/ExtensionCrawler/config.py @@ -18,6 +18,7 @@ import os + def const_sitemap_url(): return "https://chrome.google.com/webstore/sitemap" @@ -76,9 +77,11 @@ def const_review_payload(ext_id, start, end): '"internedKeys":[],' + '"internedValues":[]}}').format(ext_id, start, end) + def get_local_archive_dir(id): return "{}".format(id[:3]) -def archive_file(archivedir,ext_id): - return os.path.join(str(archivedir), get_local_archive_dir(ext_id), - ext_id + ".tar") + +def archive_file(archivedir, ext_id): + return os.path.join( + str(archivedir), get_local_archive_dir(ext_id), ext_id + ".tar") diff --git 
a/ExtensionCrawler/sqlite.py b/ExtensionCrawler/sqlite.py
index 70c7e17..796cc8b 100644
--- a/ExtensionCrawler/sqlite.py
+++ b/ExtensionCrawler/sqlite.py
@@ -20,7 +20,7 @@
 from ExtensionCrawler.config import *
 from ExtensionCrawler.util import *
 from ExtensionCrawler.crx import *
-from ExtensionCrawler import archive 
+from ExtensionCrawler import archive
 
 from pathlib import Path
 import sqlite3
@@ -37,6 +37,7 @@ class IncrementalSqliteUpdateError(Exception):
     def __init__(self, reason="unknown"):
         self.reason = reason
 
+
 def setup_tables(con):
     # TODO: delete old db if schemas don't match
     con.execute("""CREATE TABLE review ("""
@@ -89,29 +90,35 @@ def get_etag(datepath):
         with open(str(etagpath)) as f:
             return f.read()
 
+
 def get_overview_status(datepath):
     with open(str(datepath / "overview.html.status")) as f:
-    return int(f.read())
+        return int(f.read())
+
 
 def get_crx_status(datepath):
     with open(str(next(datepath.glob("*.crx.status")))) as f:
         return int(f.read())
 
+
 def parse_and_insert_overview(ext_id, date, datepath, con):
     overview_path = datepath / "overview.html"
     with open(str(overview_path)) as overview_file:
         contents = overview_file.read()
 
     # Extract extension name
-    match = re.search("""<meta itemprop="name" content="(.*?)"/>""", contents)
+    match = re.search("""<meta itemprop="name" content="(.*?)"/>""",
+                      contents)
     name = match.group(1) if match else None
 
     # Extract extension version
-    match = re.search("""<meta itemprop="version" content="(.*?)"/>""", contents)
+    match = re.search("""<meta itemprop="version" content="(.*?)"/>""",
+                      contents)
     version = match.group(1) if match else None
 
     # Extracts extension categories
-    match = re.search("""Attribute name="category">(.+?)</Attribute>""", contents)
+    match = re.search("""Attribute name="category">(.+?)</Attribute>""",
+                      contents)
     categories = match.group(1).split(",") if match else None
 
     # Extracts the number of downloads
@@ -122,14 +129,19 @@ def parse_and_insert_overview(ext_id, date, datepath, con):
     doc = BeautifulSoup(contents, 'html.parser')
 
     description_parent = doc.find('div', itemprop="description")
-    description = str(description_parent.contents[0]) if description_parent and description_parent.contents else None
-    full_description = str(description_parent.parent) if description_parent else None
+    description = str(description_parent.contents[
+        0]) if description_parent and description_parent.contents else None
+    full_description = str(
+        description_parent.parent) if description_parent else None
 
     developer_parent = doc.find(class_=lambda cls: cls and "e-f-Me" in cls)
-    developer = str(developer_parent.contents[0]) if developer_parent else None
+    developer = str(
+        developer_parent.contents[0]) if developer_parent else None
 
-    last_updated_parent = doc.find(class_=lambda cls: cls and "h-C-b-p-D-xh-hh" in cls)
-    last_updated = str(last_updated_parent.contents[0]) if last_updated_parent else None
+    last_updated_parent = doc.find(
+        class_=lambda cls: cls and "h-C-b-p-D-xh-hh" in cls)
+    last_updated = str(
+        last_updated_parent.contents[0]) if last_updated_parent else None
 
     etag = get_etag(datepath)
@@ -137,10 +149,10 @@ def parse_and_insert_overview(ext_id, date, datepath, con):
 
     crx_status = get_crx_status(datepath)
 
-
-    con.execute("INSERT INTO extension VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
-                (ext_id, date, name, version, description, downloads,
-                 full_description, developer, etag, last_updated, overview_status, crx_status))
+    con.execute("INSERT INTO extension VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
+                (ext_id, date, name, version, description, downloads,
+                 full_description, developer, etag, last_updated,
+                 overview_status, crx_status))
 
     if categories:
         for category in categories:
@@ -163,24 +175,28 @@ def parse_and_insert_crx(ext_id, date, datepath, con, verbose, indent):
     txt = ""
 
     etag = get_etag(datepath)
     crx_path = next(datepath.glob("*.crx"), None)
     filename = crx_path.name
 
     try:
         with ZipFile(str(crx_path)) as f:
             with f.open("manifest.json") as m:
                 try:
                     # There are some manifests that seem to have weird encodings...
                     manifest = json.loads(m.read().decode("utf-8-sig"))
                     if "permissions" in manifest:
                         for
permission in manifest["permissions"]: - con.execute("INSERT OR REPLACE INTO permission VALUES (?,?)", - (etag, str(permission))) + con.execute( + "INSERT OR REPLACE INTO permission VALUES (?,?)", + (etag, str(permission))) except json.decoder.JSONDecodeError: pass public_key = read_crx(str(crx_path)).pk - con.execute("INSERT INTO crx VALUES (?,?,?)", (etag, filename, public_key)) + con.execute("INSERT INTO crx VALUES (?,?,?)", + (etag, filename, public_key)) except zipfile.BadZipFile as e: txt = logmsg(verbose, txt, indent + "- {} is not a zip file\n" - .format(crx_path)) + .format(crx_path)) return txt -def update_sqlite_incremental(db_path, datepath, ext_id, date, verbose, indent): +def update_sqlite_incremental(db_path, datepath, ext_id, date, verbose, + indent): txt = "" - txt = logmsg(verbose, txt, indent + "- updating using {}\n".format(datepath)) + txt = logmsg(verbose, txt, + indent + "- updating using {}\n".format(datepath)) if not db_path.exists(): raise IncrementalSqliteUpdateError("db file not found") @@ -191,12 +207,16 @@ def update_sqlite_incremental(db_path, datepath, ext_id, date, verbose, indent): crx_path = next(datepath.glob("*.crx"), None) etag = get_etag(datepath) - etag_already_in_db = next(con.execute("SELECT COUNT(etag) FROM crx WHERE etag=?", (etag,)))[0] + etag_already_in_db = next( + con.execute("SELECT COUNT(etag) FROM crx WHERE etag=?", (etag, + )))[0] if etag and not etag_already_in_db: if crx_path: - parse_and_insert_crx(ext_id, date, datepath, con, verbose, indent) + parse_and_insert_crx(ext_id, date, datepath, con, verbose, + indent) else: - raise IncrementalSqliteUpdateError("etag not in db and no crx file present") + raise IncrementalSqliteUpdateError( + "etag not in db and no crx file present") return txt @@ -208,8 +228,8 @@ def update_sqlite_full(db_path, archivedir, ext_id, verbose, indent): os.remove(db_path) with tempfile.TemporaryDirectory() as tmpdir: - tar = archive_file(archivedir,ext_id) - with tarfile.open(tar) as t: + tar = archive_file(archivedir, ext_id) + with tarfile.open(tar) as t: t.extractall(tmpdir) iddir = Path(tmpdir) / ext_id @@ -217,11 +237,13 @@ def update_sqlite_full(db_path, archivedir, ext_id, verbose, indent): setup_tables(con) for datepath in sorted(iddir.iterdir()): date = datepath.name - updatetxt = update_sqlite_incremental(db_path, datepath, ext_id, date, verbose, indent) + updatetxt = update_sqlite_incremental( + db_path, datepath, ext_id, date, verbose, indent) txt = logmsg(verbose, txt, updatetxt) return txt + def update_sqlite(archivedir, tmptardir, ext_id, date, verbose, indent): txt = "" @@ -229,22 +251,28 @@ def update_sqlite(archivedir, tmptardir, ext_id, date, verbose, indent): archivedir = Path(archivedir) indent2 = indent + 4 * " " - - txt = logmsg(verbose, txt, indent + "* extracting information into SQLite db...\n") + txt = logmsg(verbose, txt, + indent + "* extracting information into SQLite db...\n") db_path = Path(archivedir) / ext_id[:3] / (ext_id + ".sqlite") try: - txt = logmsg(verbose, txt, indent2 + "- attempting incremental update...\n") - updatetxt = update_sqlite_incremental(db_path, datepath, ext_id, date, verbose, indent2) + txt = logmsg(verbose, txt, + indent2 + "- attempting incremental update...\n") + updatetxt = update_sqlite_incremental(db_path, datepath, ext_id, date, + verbose, indent2) txt = logmsg(verbose, txt, updatetxt) except IncrementalSqliteUpdateError as e: - txt = logmsg(verbose, txt, indent2 + "- incremental update failed: {}\n".format(e.reason)) + txt = logmsg(verbose, txt, 
indent2 + + "- incremental update failed: {}\n".format(e.reason)) txt = logmsg(verbose, txt, indent2 + "- regenerating full db...\n") try: - fullmsg = update_sqlite_full(db_path, archivedir, ext_id, verbose, indent2) + fullmsg = update_sqlite_full(db_path, archivedir, ext_id, verbose, + indent2) txt = logmsg(verbose, txt, fullmsg) except IncrementalSqliteUpdateError as e: - txt = logmsg(verbose, txt, indent2 + "- full sqlite update failed: {}, giving up\n".format(e.reason)) + txt = logmsg(verbose, txt, indent2 + + "- full sqlite update failed: {}, giving up\n".format( + e.reason)) return txt diff --git a/crawler b/crawler index 6fac7c6..e60eca0 100755 --- a/crawler +++ b/crawler @@ -34,7 +34,8 @@ import time import getopt # Script should run with python 3.5 -assert sys.version_info >= (3,5) and sys.version_info < (3,6) +assert sys.version_info >= (3, 5) and sys.version_info < (3, 6) + def write_log(dir, fname, text): os.makedirs(dir, exist_ok=True) From c4a5c5a231c28b3c575ef7f77b4db4fc1ca1d7ba Mon Sep 17 00:00:00 2001 From: "Achim D. Brucker" Date: Fri, 16 Jun 2017 23:32:52 +0100 Subject: [PATCH 08/16] Ignore non extensions ids in forums.conf. --- ExtensionCrawler/archive.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ExtensionCrawler/archive.py b/ExtensionCrawler/archive.py index fbb3eb8..8ffc94c 100644 --- a/ExtensionCrawler/archive.py +++ b/ExtensionCrawler/archive.py @@ -513,5 +513,6 @@ def get_existing_ids(archivedir, verbose): def get_forum_ext_ids(confdir, verbose): with open(os.path.join(confdir, "forums.conf")) as f: ids = f.readlines() + r = re.compile('^[a-p]+$') ids = [x.strip() for x in ids] - return ids + return filter(r.match, ids) From 97460c498f0be0b054fc9ba1efeb0b067f93c172 Mon Sep 17 00:00:00 2001 From: "Achim D. Brucker" Date: Sat, 17 Jun 2017 00:43:40 +0100 Subject: [PATCH 09/16] Basic support for logging of errors related to SQL import/update. 
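
For context, a simplified model of how the new res_sql field is consumed
(hypothetical standalone sketch; UpdateResult is reduced to the two attributes
relevant here, and the extension ids are dummies):

    from functools import reduce

    class UpdateResult:
        def __init__(self, id, res_sql):
            self.id = id
            self.res_sql = res_sql  # exception raised during the SQL update, or None

        def sql_exception(self):
            return self.res_sql is not None

    res = [UpdateResult("a" * 32, None), UpdateResult("b" * 32, ValueError("bad db"))]
    # Mirrors log_failures_to_file: newline-joined, sorted ids of failing updates.
    log = reduce(lambda x, y: x + "\n" + y,
                 sorted(r.id for r in res if r.sql_exception()), "")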
--- ExtensionCrawler/archive.py | 19 ++++++++++++++----- crawler | 7 +++++++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/ExtensionCrawler/archive.py b/ExtensionCrawler/archive.py index 8ffc94c..57c224c 100644 --- a/ExtensionCrawler/archive.py +++ b/ExtensionCrawler/archive.py @@ -76,7 +76,7 @@ class RequestResult: class UpdateResult: def __init__(self, id, is_new, exception, res_overview, res_crx, - res_reviews, res_support): + res_reviews, res_support,res_sql): self.id = id self.new = is_new self.exception = exception @@ -84,6 +84,7 @@ class UpdateResult: self.res_crx = res_crx self.res_reviews = res_reviews self.res_support = res_support + self.res_sql = res_sql def is_new(self): return self.new @@ -128,6 +129,9 @@ class UpdateResult: def corrupt_tar(self): return self.exception is not None + def sql_exception(self): + return self.res_sql is not None + def write_text(tardir, date, fname, text): dir = os.path.join(tardir, date) @@ -354,6 +358,7 @@ def update_extension(archivedir, verbose, forums, ext_id): logtxt = logmsg(verbose, "", " Updating {}".format(ext_id)) is_new = False tar_exception = None + sql_exception = None tmptardir = "" tmptar = "" @@ -380,7 +385,7 @@ def update_extension(archivedir, verbose, forums, ext_id): logtxt = logmsg(verbose, logtxt, " / Exception: {}\n".format(str(e))) tar_exception = e return UpdateResult(ext_id, is_new, tar_exception, res_overview, - res_crx, res_reviews, res_support) + res_crx, res_reviews, res_support, sql_exception) res_overview, msg_overview = update_overview(tmptardir, date, verbose, ext_id) @@ -452,8 +457,12 @@ def update_extension(archivedir, verbose, forums, ext_id): " * Eventually failed create sqlite files") logtxt = logmsg(verbose, logtxt, " / Exception: {}\n".format(str(e))) - tar_exception = e + sql_exception = e + try: + write_text(tardir, date, ext_id + ".sql.exception", str(e)) + except Exception as e: + pass try: shutil.rmtree(path=tmpdir) except Exception as e: @@ -467,7 +476,7 @@ def update_extension(archivedir, verbose, forums, ext_id): pass return UpdateResult(ext_id, is_new, tar_exception, res_overview, res_crx, - res_reviews, res_support) + res_reviews, res_support, sql_exception) def update_extensions(archivedir, verbose, forums_ext_ids, ext_ids): @@ -515,4 +524,4 @@ def get_forum_ext_ids(confdir, verbose): ids = f.readlines() r = re.compile('^[a-p]+$') ids = [x.strip() for x in ids] - return filter(r.match, ids) + return list(filter(r.match, ids)) diff --git a/crawler b/crawler index e60eca0..44ed7fb 100755 --- a/crawler +++ b/crawler @@ -81,6 +81,11 @@ def log_failures_to_file(dir, today, res): sorted(map(lambda x: x.id, filter(lambda x: x.corrupt_tar(), res))), "") write_log(dir, today + "-file-corruption.log", file_corruption) + sql_exception = reduce( + lambda x, y: x + "\n" + y, + sorted(map(lambda x: x.id, filter(lambda x: x.sql_exception(), res))), + "") + write_log(dir, today + "-sql-exception.log", sql_exception) def log_summary(verbose, res, stderr=False, runtime=0): @@ -98,6 +103,7 @@ def log_summary(verbose, res, stderr=False, runtime=0): not_in_store = len(list(filter(lambda x: x.not_in_store(), res))) not_modified = len(list(filter(lambda x: x.not_modified(), res))) corrupt_tar_archives = list(filter(lambda x: x.corrupt_tar(), res)) + sql_exception = len(list(filter(lambda x: x.sql_exception(), res))) new = len(list(filter(lambda x: x.is_new(), res))) updated = len( @@ -115,6 +121,7 @@ def log_summary(verbose, res, stderr=False, runtime=0): p(" Extensions not in store: 
{:8d}\n".format(not_in_store))
     p("  Unknown exception:        {:8d}\n".format(has_exception))
     p("  Corrupt tar archives:     {:8d}\n".format(len(corrupt_tar_archives)))
+    p("  SQL exception:            {:8d}\n".format(sql_exception))
     p("  Total runtime:            {}\n".format(
         str(datetime.timedelta(seconds=int(runtime)))))

From 8fcc7ab99f8a296f41e98cbdf2ff199e90c0d6ae Mon Sep 17 00:00:00 2001
From: "Achim D. Brucker"
Date: Sat, 17 Jun 2017 00:48:34 +0100
Subject: [PATCH 10/16] Fixed logging.

---
 ExtensionCrawler/archive.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ExtensionCrawler/archive.py b/ExtensionCrawler/archive.py
index 57c224c..679f77c 100644
--- a/ExtensionCrawler/archive.py
+++ b/ExtensionCrawler/archive.py
@@ -475,6 +475,7 @@ def update_extension(archivedir, verbose, forums, ext_id):
         except Exception:
             pass
 
+    log(verbose, logtxt)
     return UpdateResult(ext_id, is_new, tar_exception, res_overview, res_crx,
                         res_reviews, res_support, sql_exception)

From 760ac171f16cddfea9de6e522c0f3af74a2a21b1 Mon Sep 17 00:00:00 2001
From: "Achim D. Brucker"
Date: Sat, 17 Jun 2017 15:43:18 +0100
Subject: [PATCH 11/16] Relaxed supported version to 3.4 or 3.5.

---
 crawler | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/crawler b/crawler
index 44ed7fb..bb76dd9 100755
--- a/crawler
+++ b/crawler
@@ -33,8 +33,8 @@ import dateutil.parser
 import time
 import getopt
 
-# Script should run with python 3.5
-assert sys.version_info >= (3, 5) and sys.version_info < (3, 6)
+# Script should run with python 3.4 or 3.5
+assert sys.version_info >= (3, 4) and sys.version_info < (3, 6)
 
 
 def write_log(dir, fname, text):

From 7f24a9da7abcba981eac15455ccac52ad54644a0 Mon Sep 17 00:00:00 2001
From: Michael Herzberg
Date: Sat, 17 Jun 2017 17:10:18 +0100
Subject: [PATCH 12/16] Split db creation into incremental part and separate
 full regeneration script.
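
The new create_db script drives the full regeneration. Typical invocation, per
its help() (paths are hypothetical; -a names the archive base directory and -p
restricts the run to one three-letter extension-id prefix):

    ./create_db -a archive -p aaa

This rebuilds archive/data/aaa*/<extid>.sqlite from the corresponding
<extid>.tar archives, removing any pre-existing database file first.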
--- ExtensionCrawler/sqlite.py | 211 +++++++++++++------------------------ create_db | 120 +++++++++++++++++++++ 2 files changed, 196 insertions(+), 135 deletions(-) create mode 100755 create_db diff --git a/ExtensionCrawler/sqlite.py b/ExtensionCrawler/sqlite.py index 796cc8b..369379d 100644 --- a/ExtensionCrawler/sqlite.py +++ b/ExtensionCrawler/sqlite.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 # # Copyright (C) 2017 The University of Sheffield, UK # @@ -22,7 +21,6 @@ from ExtensionCrawler.crx import * from ExtensionCrawler import archive -from pathlib import Path import sqlite3 import re from bs4 import BeautifulSoup @@ -31,79 +29,62 @@ import json import os import tempfile import tarfile +import glob -class IncrementalSqliteUpdateError(Exception): +class SqliteUpdateError(Exception): def __init__(self, reason="unknown"): self.reason = reason -def setup_tables(con): - # TODO: delete old db if schemas don't match - con.execute("""CREATE TABLE review (""" - """id INTEGER PRIMARY KEY,""" - """extid TEXT,""" - """date TEXT,""" - """user TEXT,""" - """reviewdate TEXT,""" - """rating TEXT,""" - """comment TEXT""" - """)""") - con.execute("""CREATE TABLE category (""" - """extid TEXT,""" - """date TEXT,""" - """category TEXT,""" - """PRIMARY KEY (extid, date, category)""" - """)""") - con.execute("""CREATE TABLE permission (""" - """crx_etag TEXT,""" - """permission TEXT,""" - """PRIMARY KEY (crx_etag, permission)""" - """)""") - con.execute("""CREATE TABLE crx (""" - """etag TEXT PRIMARY KEY,""" - """filename TEXT,""" - """publickey BLOB""" - """)""") - con.execute("""CREATE TABLE extension (""" - """extid TEXT,""" - """date TEXT,""" - """name TEXT,""" - """version TEXT,""" - """description TEXT,""" - """downloads INTEGER,""" - """fulldescription TEXT,""" - """developer TEXT,""" - """crx_etag TEXT,""" - """crx_status INTEGER,""" - """overview_status INTEGER,""" - """lastupdated TEXT,""" - """PRIMARY KEY (extid, date),""" - """FOREIGN KEY (crx_etag) REFERENCES crx(etag)""" - """)""") - - -def get_etag(datepath): - etagpath = next(datepath.glob("*.etag"), None) - +def get_etag(ext_id, datepath, con): + #Trying etag file + etagpath = next(iter(glob.glob(os.path.join(datepath, "*.etag"))), None) if etagpath: - with open(str(etagpath)) as f: + with open(etagpath) as f: return f.read() + #Trying to parse header file for etag + headerpath = next( + iter(glob.glob(os.path.join(datepath, "*.crx.headers"))), None) + if headerpath: + with open(headerpath) as f: + headers = eval(f.read()) + if "ETag" in headers: + return headers["ETag"] + + #Trying to look up previous etag in database + linkpath = next( + iter(glob.glob(os.path.join(datepath, "*.crx.link"))), None) + if linkpath: + with open(linkpath) as f: + link = f.read() + linked_date = link[3:].split("/")[0] + + row = next( + con.execute( + "SELECT crx_etag FROM extension WHERE extid=? 
AND date=?", + (ext_id, linked_date)), None) + if row: + return row[0] + def get_overview_status(datepath): - with open(str(datepath / "overview.html.status")) as f: + with open(os.path.join(datepath, "overview.html.status")) as f: return int(f.read()) def get_crx_status(datepath): - with open(str(next(datepath.glob("*.crx.status")))) as f: - return int(f.read()) + statuspath = next( + iter(glob.glob(os.path.join(datepath, "*.crx.status"))), None) + if statuspath: + with open(statuspath) as f: + return int(f.read()) def parse_and_insert_overview(ext_id, date, datepath, con): - overview_path = datepath / "overview.html" - with open(str(overview_path)) as overview_file: + overview_path = os.path.join(datepath, "overview.html") + with open(overview_path) as overview_file: contents = overview_file.read() # Extract extension name @@ -143,7 +124,7 @@ def parse_and_insert_overview(ext_id, date, datepath, con): last_updated = str( last_updated_parent.contents[0]) if last_updated_parent else None - etag = get_etag(datepath) + etag = get_etag(ext_id, datepath, con) overview_status = get_overview_status(datepath) @@ -160,35 +141,28 @@ def parse_and_insert_overview(ext_id, date, datepath, con): (ext_id, date, category)) -def parse_and_insert_crx(ext_id, date, datepath, con, verbose, indent): - txt = "" +def parse_and_insert_crx(ext_id, date, datepath, con): + etag = get_etag(ext_id, datepath, con) + crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None) + filename = os.path.basename(crx_path) - etag = get_etag(datepath) - crx_path = next(datepath.glob("*.crx"), None) - filename = crx_path.name + with ZipFile(crx_path) as f: + with f.open("manifest.json") as m: + try: + # There are some manifests that seem to have weird encodings... + manifest = json.loads(m.read().decode("utf-8-sig")) + if "permissions" in manifest: + for permission in manifest["permissions"]: + con.execute( + "INSERT OR REPLACE INTO permission VALUES (?,?)", + (etag, str(permission))) + except json.decoder.JSONDecodeError: + pass - try: - with ZipFile(str(crx_path)) as f: - with f.open("manifest.json") as m: - try: - # There are some manifests that seem to have weird encodings... 
- manifest = json.loads(m.read().decode("utf-8-sig")) - if "permissions" in manifest: - for permission in manifest["permissions"]: - con.execute( - "INSERT OR REPLACE INTO permission VALUES (?,?)", - (etag, str(permission))) - except json.decoder.JSONDecodeError: - pass + public_key = read_crx(crx_path).pk - public_key = read_crx(str(crx_path)).pk - - con.execute("INSERT INTO crx VALUES (?,?,?)", - (etag, filename, public_key)) - except zipfile.BadZipFile as e: - txt = logmsg(verbose, txt, indent + "- {} is not a zip file\n" - .format(crx_path)) - return txt + con.execute("INSERT INTO crx VALUES (?,?,?)", (etag, filename, + public_key)) def update_sqlite_incremental(db_path, datepath, ext_id, date, verbose, @@ -198,81 +172,48 @@ def update_sqlite_incremental(db_path, datepath, ext_id, date, verbose, txt = logmsg(verbose, txt, indent + "- updating using {}\n".format(datepath)) - if not db_path.exists(): - raise IncrementalSqliteUpdateError("db file not found") + if not os.path.exists(db_path): + raise SqliteUpdateError("db file not found") - with sqlite3.connect(str(db_path)) as con: + with sqlite3.connect(db_path) as con: parse_and_insert_overview(ext_id, date, datepath, con) - crx_path = next(datepath.glob("*.crx"), None) + crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None) - etag = get_etag(datepath) + etag = get_etag(ext_id, datepath, con) etag_already_in_db = next( - con.execute("SELECT COUNT(etag) FROM crx WHERE etag=?", (etag, - )))[0] + con.execute("SELECT COUNT(etag) FROM crx WHERE etag=?", (etag, )))[ + 0] if etag and not etag_already_in_db: if crx_path: - parse_and_insert_crx(ext_id, date, datepath, con, verbose, - indent) + parse_and_insert_crx(ext_id, date, datepath, con) else: - raise IncrementalSqliteUpdateError( + raise SqliteUpdateError( "etag not in db and no crx file present") return txt -def update_sqlite_full(db_path, archivedir, ext_id, verbose, indent): - txt = "" - - if db_path.exists(): - os.remove(db_path) - - with tempfile.TemporaryDirectory() as tmpdir: - tar = archive_file(archivedir, ext_id) - with tarfile.open(tar) as t: - t.extractall(tmpdir) - iddir = Path(tmpdir) / ext_id - - with sqlite3.connect(str(db_path)) as con: - setup_tables(con) - for datepath in sorted(iddir.iterdir()): - date = datepath.name - updatetxt = update_sqlite_incremental( - db_path, datepath, ext_id, date, verbose, indent) - txt = logmsg(verbose, txt, updatetxt) - - return txt - - def update_sqlite(archivedir, tmptardir, ext_id, date, verbose, indent): txt = "" - - datepath = Path(tmptardir) / date - archivedir = Path(archivedir) indent2 = indent + 4 * " " + datepath = os.path.join(tmptardir, date) + txt = logmsg(verbose, txt, indent + "* extracting information into SQLite db...\n") - db_path = Path(archivedir) / ext_id[:3] / (ext_id + ".sqlite") + db_path = os.path.join(archivedir, ext_id[:3], ext_id + ".sqlite") + txt = logmsg(verbose, txt, + indent2 + "- attempting incremental update...\n") try: - txt = logmsg(verbose, txt, - indent2 + "- attempting incremental update...\n") updatetxt = update_sqlite_incremental(db_path, datepath, ext_id, date, verbose, indent2) txt = logmsg(verbose, txt, updatetxt) - except IncrementalSqliteUpdateError as e: - txt = logmsg(verbose, txt, indent2 + - "- incremental update failed: {}\n".format(e.reason)) - txt = logmsg(verbose, txt, indent2 + "- regenerating full db...\n") - try: - fullmsg = update_sqlite_full(db_path, archivedir, ext_id, verbose, - indent2) - txt = logmsg(verbose, txt, fullmsg) - except 
IncrementalSqliteUpdateError as e:
-            txt = logmsg(verbose, txt, indent2 +
-                         "- full sqlite update failed: {}, giving up\n".format(
-                             e.reason))
 
     return txt
diff --git a/create_db b/create_db
new file mode 100755
index 0000000..3b22a1f
--- /dev/null
+++ b/create_db
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+#
+# Copyright (C) 2016,2017 The University of Sheffield, UK
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+import getopt
+import os
+import sys
+import glob
+import tarfile
+import tempfile
+
+from ExtensionCrawler.sqlite import *
+
+
+def setup_tables(con):
+    con.execute("""CREATE TABLE review ("""
+                """id INTEGER PRIMARY KEY,"""
+                """extid TEXT,"""
+                """date TEXT,"""
+                """user TEXT,"""
+                """reviewdate TEXT,"""
+                """rating TEXT,"""
+                """comment TEXT"""
+                """)""")
+    con.execute("""CREATE TABLE category ("""
+                """extid TEXT,"""
+                """date TEXT,"""
+                """category TEXT,"""
+                """PRIMARY KEY (extid, date, category)"""
+                """)""")
+    con.execute("""CREATE TABLE permission ("""
+                """crx_etag TEXT,"""
+                """permission TEXT,"""
+                """PRIMARY KEY (crx_etag, permission)"""
+                """)""")
+    con.execute("""CREATE TABLE crx ("""
+                """etag TEXT PRIMARY KEY,"""
+                """filename TEXT,"""
+                """publickey BLOB"""
+                """)""")
+    con.execute("""CREATE TABLE extension ("""
+                """extid TEXT,"""
+                """date TEXT,"""
+                """name TEXT,"""
+                """version TEXT,"""
+                """description TEXT,"""
+                """downloads INTEGER,"""
+                """fulldescription TEXT,"""
+                """developer TEXT,"""
+                """crx_etag TEXT,"""
+                """lastupdated TEXT,"""
+                """crx_status INTEGER,"""
+                """overview_status INTEGER,"""
+                """PRIMARY KEY (extid, date),"""
+                """FOREIGN KEY (crx_etag) REFERENCES crx(etag)"""
+                """)""")
+
+
+def help():
+    print("create_db [OPTION]")
+    print("    -h        print this help text")
+    print("    -a=       archive directory")
+    print("    -p=       three-letter-prefix")
+
+
+def main(argv):
+    basedir = "archive"
+    prefix = ""
+    try:
+        opts, args = getopt.getopt(argv, "ha:p:", ["archive=", "prefix="])
+    except getopt.GetoptError:
+        help()
+        sys.exit(2)
+    for opt, arg in opts:
+        if opt == '-h':
+            help()
+            sys.exit()
+        elif opt in ("-a", "--archive"):
+            basedir = arg
+        elif opt in ("-p", "--prefix"):
+            prefix = arg
+
+    archive_dir = os.path.join(basedir, "data")
+    threeletterdirs = glob.glob(os.path.join(archive_dir, prefix + "*"))
+    for threeletterdir in threeletterdirs:
+        for ext_id in set([d[:32] for d in os.listdir(threeletterdir)]):
+            tarpath = os.path.join(threeletterdir, ext_id + ".tar")
+            dbpath = os.path.join(threeletterdir, ext_id + ".sqlite")
+            if os.path.exists(dbpath):
+                os.remove(dbpath)
+            with tempfile.TemporaryDirectory() as tmpdir:
+                with tarfile.open(tarpath) as t:
+                    t.extractall(tmpdir)
+                iddir = os.path.join(tmpdir, ext_id)
+
+                with sqlite3.connect(dbpath) as con:
+                    setup_tables(con)
+                    for date in sorted(os.listdir(iddir)):
+                        datepath = os.path.join(iddir, date)
+                        print(
From 2e6323c8c58ab0fd8f4dce35d9280309f87b613b Mon Sep 17 00:00:00 2001
From: "Achim D. Brucker"
Date: Sat, 17 Jun 2017 18:15:08 +0100
Subject: [PATCH 13/16] Report number of extensions for which the SQL database
 was updated.

---
 ExtensionCrawler/archive.py | 16 +++++++++++-----
 ExtensionCrawler/sqlite.py  |  4 +++-
 crawler                     |  9 +++++++++
 3 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/ExtensionCrawler/archive.py b/ExtensionCrawler/archive.py
index 679f77c..8ff621a 100644
--- a/ExtensionCrawler/archive.py
+++ b/ExtensionCrawler/archive.py
@@ -76,7 +76,7 @@ class RequestResult:
 class UpdateResult:
     def __init__(self, id, is_new, exception, res_overview, res_crx,
-                 res_reviews, res_support,res_sql):
+                 res_reviews, res_support,res_sql, sql_update):
         self.id = id
         self.new = is_new
         self.exception = exception
@@ -85,6 +85,7 @@ class UpdateResult:
         self.res_reviews = res_reviews
         self.res_support = res_support
         self.res_sql = res_sql
+        self.sql_update = sql_update
 
     def is_new(self):
         return self.new
@@ -131,6 +132,9 @@ class UpdateResult:
 
     def sql_exception(self):
         return self.res_sql is not None
+
+    def sql_success(self):
+        return self.sql_update
 
 
 def write_text(tardir, date, fname, text):
@@ -359,6 +363,7 @@ def update_extension(archivedir, verbose, forums, ext_id):
     is_new = False
     tar_exception = None
     sql_exception = None
+    sql_success = False
 
     tmptardir = ""
     tmptar = ""
@@ -385,7 +390,7 @@ def update_extension(archivedir, verbose, forums, ext_id):
         logtxt = logmsg(verbose, logtxt,
                         " / Exception: {}\n".format(str(e)))
         tar_exception = e
         return UpdateResult(ext_id, is_new, tar_exception, res_overview,
-                            res_crx, res_reviews, res_support, sql_exception)
+                            res_crx, res_reviews, res_support, sql_exception, False)
 
     res_overview, msg_overview = update_overview(tmptardir, date, verbose,
                                                  ext_id)
@@ -449,12 +454,13 @@ def update_extension(archivedir, verbose, forums, ext_id):
         pass
 
     try:
-        msg_updatesqlite = update_sqlite(archivedir, tmptardir, ext_id, date,
+        sql_success, msg_updatesqlite = update_sqlite(archivedir, tmptardir, ext_id, date,
                                          verbose, 11 * " ")
         logtxt = logmsg(verbose, logtxt, msg_updatesqlite)
+
     except Exception as e:
         logtxt = logmsg(verbose, logtxt,
-                        " * Eventually failed create sqlite files")
+                        " * Exception during update of sqlite db ")
         logtxt = logmsg(verbose, logtxt,
                         " / Exception: {}\n".format(str(e)))
         sql_exception = e
@@ -477,7 +483,7 @@ def update_extension(archivedir, verbose, forums, ext_id):
     log(verbose, logtxt)
 
     return UpdateResult(ext_id, is_new, tar_exception, res_overview, res_crx,
-                        res_reviews, res_support, sql_exception)
+                        res_reviews, res_support, sql_exception, sql_success)
 
 
 def update_extensions(archivedir, verbose, forums_ext_ids, ext_ids):
diff --git a/ExtensionCrawler/sqlite.py b/ExtensionCrawler/sqlite.py
index 369379d..8d463f1 100644
--- a/ExtensionCrawler/sqlite.py
+++ b/ExtensionCrawler/sqlite.py
@@ -195,6 +195,7 @@ def update_sqlite_incremental(db_path, datepath, ext_id, date, verbose,
 
 
 def update_sqlite(archivedir, tmptardir, ext_id, date, verbose, indent):
+    update_successful = False
     txt = ""
     indent2 = indent + 4 * " "
 
@@ -211,9 +212,10 @@ def update_sqlite(archivedir, tmptardir, ext_id, date, verbose, indent):
         updatetxt = update_sqlite_incremental(db_path, datepath, ext_id, date,
                                               verbose, indent2)
         txt = logmsg(verbose, txt, updatetxt)
+        update_successful = True
     except SqliteUpdateError as e:
         txt = logmsg(
             verbose, txt,
             indent2 + "- incremental update failed: {}\n".format(e.reason))
 
-    return txt
+    return update_successful, txt
diff --git a/crawler b/crawler
index bb76dd9..9818a41 100755
--- a/crawler
+++ b/crawler
@@ -81,11 +81,18 @@ def log_failures_to_file(dir, today, res):
         sorted(map(lambda x: x.id, filter(lambda x: x.corrupt_tar(), res))),
         "")
     write_log(dir, today + "-file-corruption.log", file_corruption)
+
     sql_exception = reduce(
         lambda x, y: x + "\n" + y,
         sorted(map(lambda x: x.id, filter(lambda x: x.sql_exception(), res))),
         "")
     write_log(dir, today + "-sql-exception.log", sql_exception)
+
+    sql_success = reduce(
+        lambda x, y: x + "\n" + y,
+        sorted(map(lambda x: x.id, filter(lambda x: not x.sql_success(), res))),
+        "")
+    write_log(dir, today + "-sql-not-updated.log", sql_success)
 
 
 def log_summary(verbose, res, stderr=False, runtime=0):
@@ -104,6 +111,7 @@ def log_summary(verbose, res, stderr=False, runtime=0):
     not_modified = len(list(filter(lambda x: x.not_modified(), res)))
     corrupt_tar_archives = list(filter(lambda x: x.corrupt_tar(), res))
     sql_exception = len(list(filter(lambda x: x.sql_exception(), res)))
+    sql_success = len(list(filter(lambda x: x.sql_success(), res)))
 
     new = len(list(filter(lambda x: x.is_new(), res)))
     updated = len(
@@ -114,6 +122,7 @@ def log_summary(verbose, res, stderr=False, runtime=0):
     p("    Updated {} out of {} extensions successfully\n".format(
         str(success), str(total)))
     p("    Updated extensions:      {:8d}\n".format(updated))
+    p("    Updated SQL databases:   {:8d}\n".format(sql_success))
     p("    New extensions:          {:8d}\n".format(new))
     p("    Not authorized:          {:8d}\n".format(not_authorized))
     p("    Raised Google DDOS:      {:8d}\n".format(raised_ddos))
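
With sql_update threaded through UpdateResult, the crawler can both count successful SQLite updates and log the ids that were skipped. A toy illustration of that bookkeeping (Result is a stand-in for UpdateResult and the data is made up):

    class Result:
        # Stand-in carrying just the field and predicate added in this patch.
        def __init__(self, id, sql_update):
            self.id = id
            self.sql_update = sql_update

        def sql_success(self):
            return self.sql_update

    res = [Result("aaa", True), Result("bbb", False), Result("ccc", True)]
    print("    Updated SQL databases:   {:8d}".format(
        len(list(filter(lambda x: x.sql_success(), res)))))
    # ids destined for the <date>-sql-not-updated.log file:
    print("\n".join(sorted(x.id for x in res if not x.sql_success())))
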
From 85c8f6a54631d9c3265a4b07c0d3470339864ed3 Mon Sep 17 00:00:00 2001
From: "Achim D. Brucker"
Date: Sat, 17 Jun 2017 18:19:44 +0100
Subject: [PATCH 14/16] Pass is_new flag to sqlite update.

---
 ExtensionCrawler/archive.py | 2 +-
 ExtensionCrawler/sqlite.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ExtensionCrawler/archive.py b/ExtensionCrawler/archive.py
index 8ff621a..e23e69f 100644
--- a/ExtensionCrawler/archive.py
+++ b/ExtensionCrawler/archive.py
@@ -454,7 +454,7 @@ def update_extension(archivedir, verbose, forums, ext_id):
         pass
 
     try:
-        sql_success, msg_updatesqlite = update_sqlite(archivedir, tmptardir, ext_id, date,
+        sql_success, msg_updatesqlite = update_sqlite(archivedir, tmptardir, ext_id, date, is_new,
                                          verbose, 11 * " ")
         logtxt = logmsg(verbose, logtxt, msg_updatesqlite)
 
diff --git a/ExtensionCrawler/sqlite.py b/ExtensionCrawler/sqlite.py
index 8d463f1..9633e05 100644
--- a/ExtensionCrawler/sqlite.py
+++ b/ExtensionCrawler/sqlite.py
@@ -165,7 +165,7 @@ def parse_and_insert_crx(ext_id, date, datepath, con):
                 public_key))
 
 
-def update_sqlite_incremental(db_path, datepath, ext_id, date, verbose,
+def update_sqlite_incremental(db_path, datepath, ext_id, date, is_new,verbose,
                               indent):
     txt = ""
 
From 66eff6780d8d9e02045be8a62d47bbf0cb6809e2 Mon Sep 17 00:00:00 2001
From: "Achim D. Brucker"
Date: Sat, 17 Jun 2017 18:26:04 +0100
Subject: [PATCH 15/16] Fixed passing is_new.

---
 ExtensionCrawler/sqlite.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ExtensionCrawler/sqlite.py b/ExtensionCrawler/sqlite.py
index 9633e05..e3f7576 100644
--- a/ExtensionCrawler/sqlite.py
+++ b/ExtensionCrawler/sqlite.py
@@ -165,7 +165,7 @@ def parse_and_insert_crx(ext_id, date, datepath, con):
                 public_key))
 
 
-def update_sqlite_incremental(db_path, datepath, ext_id, date, is_new,verbose,
+def update_sqlite_incremental(db_path, datepath, ext_id, date, verbose,
                               indent):
     txt = ""
 
@@ -194,7 +194,7 @@ def update_sqlite_incremental(db_path, datepath, ext_id, date, is_new,verbose,
     return txt
 
 
-def update_sqlite(archivedir, tmptardir, ext_id, date, verbose, indent):
+def update_sqlite(archivedir, tmptardir, ext_id, date, is_new, verbose, indent):
     update_successful = False
     txt = ""
     indent2 = indent + 4 * " "
 
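
Taken together, patches 14 and 15 settle on one interface: update_sqlite receives the is_new flag from update_extension and returns a pair of success flag and log text, while update_sqlite_incremental keeps its original signature. A shape-only sketch of the resulting call site (the stub body is illustrative, not the real implementation):

    def update_sqlite(archivedir, tmptardir, ext_id, date, is_new, verbose,
                      indent):
        # Stub with the signature the patches converge on; it returns the
        # (update_successful, txt) pair established in patch 13.
        return True, indent + "* extracting information into SQLite db...\n"

    sql_success, msg_updatesqlite = update_sqlite(
        "archive", "/tmp/tmptardir", 32 * "a", "2017-06-18", False, True,
        11 * " ")
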
From d9195c8174c067933fd883c78781a9c9a90cb73f Mon Sep 17 00:00:00 2001
From: "Achim D. Brucker"
Date: Sun, 18 Jun 2017 15:36:21 +0100
Subject: [PATCH 16/16] Max. number of concurrent downloads can now be
 configured via command line.

---
 ExtensionCrawler/archive.py |  4 ++--
 crawler                     | 16 ++++++++++------
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/ExtensionCrawler/archive.py b/ExtensionCrawler/archive.py
index e23e69f..4992a62 100644
--- a/ExtensionCrawler/archive.py
+++ b/ExtensionCrawler/archive.py
@@ -486,7 +486,7 @@ def update_extension(archivedir, verbose, forums, ext_id):
         res_reviews, res_support, sql_exception, sql_success)
 
 
-def update_extensions(archivedir, verbose, forums_ext_ids, ext_ids):
+def update_extensions(archivedir, verbose, parallel, forums_ext_ids, ext_ids):
     ext_with_forums = []
     ext_without_forums = []
     ext_ids = list(set(ext_ids) - set(forums_ext_ids))
@@ -509,7 +509,7 @@ def update_extensions(archivedir, verbose, forums_ext_ids, ext_ids):
     log(verbose,
         " Updating {} extensions excluding forums (parallel))\n".format(
             len(parallel_ids)))
-    with Pool(12) as p:
+    with Pool(parallel) as p:
         ext_without_forums = list(
             p.map(
                 partial(update_extension, archivedir, verbose, False),
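
The archive.py hunk replaces the hard-coded Pool(12) with the new parallel parameter. The underlying pattern, binding the fixed leading arguments of the worker with functools.partial and mapping only the varying ids over a multiprocessing.Pool, is easy to isolate; a self-contained toy version (square stands in for update_extension, the numbers are made up):

    from functools import partial
    from multiprocessing import Pool

    def square(offset, x):
        # Toy worker; archive.py binds archivedir, verbose and the forums
        # flag via partial and maps over the extension ids instead.
        return offset + x * x

    if __name__ == "__main__":
        parallel = 4  # previously hard-coded as 12
        with Pool(parallel) as p:
            print(p.map(partial(square, 100), [1, 2, 3]))  # [101, 104, 109]
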
diff --git a/crawler b/crawler
index 9818a41..45e050e 100755
--- a/crawler
+++ b/crawler
@@ -154,10 +154,11 @@ def help():
 def main(argv):
     today = datetime.datetime.now(datetime.timezone.utc).isoformat()
     basedir = "archive"
+    parallel = 24
     verbose = True
     discover = False
     try:
-        opts, args = getopt.getopt(argv, "hsda:", ["archive="])
+        opts, args = getopt.getopt(argv, "hsda:p:", ["archive=", "parallel="])
     except getopt.GetoptError:
         help()
         sys.exit(2)
@@ -167,6 +168,8 @@ def main(argv):
             sys.exit()
         elif opt in ("-a", "--archive"):
             basedir = arg
+        elif opt in ("-p", "--parallel"):
+            parallel = int(arg)
         elif opt == '-s':
             verbose = False
         elif opt == '-d':
@@ -183,10 +186,11 @@ def main(argv):
     start_time = time.time()
 
     log(verbose, "Configuration:\n")
-    log(verbose, "  Base dir: {}\n".format(basedir))
-    log(verbose, "  Archive dir: {}\n".format(archive_dir))
-    log(verbose, "  Conf. dir: {}\n".format(conf_dir))
-    log(verbose, "  Discover new ext.: {}\n".format(discover))
+    log(verbose, "  Base dir:                         {}\n".format(basedir))
+    log(verbose, "  Archive directory:                {}\n".format(archive_dir))
+    log(verbose, "  Configuration directory:          {}\n".format(conf_dir))
+    log(verbose, "  Discover new extensions:          {}\n".format(discover))
+    log(verbose, "  Max num. of concurrent downloads: {}\n".format(parallel))
     log(verbose, "\n")
 
     forum_ext_ids = get_forum_ext_ids(conf_dir, verbose)
@@ -197,7 +201,7 @@ def main(argv):
         discovered_ids = get_new_ids(verbose, known_ids)
         ext_ids = list(set(discovered_ids) | set(known_ids))
 
-    res = update_extensions(archive_dir, verbose, forum_ext_ids, ext_ids)
+    res = update_extensions(archive_dir, verbose, parallel, forum_ext_ids, ext_ids)
 
     # We re-try (once) the extensions with unknown exceptions, as
     # they are often temporary