diff --git a/.gitignore b/.gitignore
index 382f6f6..0267ec1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -58,4 +58,7 @@ docs/_build/
 # PyBuilder
 target/
 
+# vi
+*.swp
+
 archive
diff --git a/ExtensionCrawler/archive.py b/ExtensionCrawler/archive.py
index 9f4ed0f..5072a15 100644
--- a/ExtensionCrawler/archive.py
+++ b/ExtensionCrawler/archive.py
@@ -76,7 +76,7 @@ class RequestResult:
 
 class UpdateResult:
     def __init__(self, id, is_new, exception, res_overview, res_crx,
-                 res_reviews, res_support,res_sql, sql_update):
+                 res_reviews, res_support, res_sql, sql_update):
         self.id = id
         self.new = is_new
         self.exception = exception
@@ -111,18 +111,18 @@ class UpdateResult:
                 (self.res_support is not None and
                  self.res_support.not_found()))
 
     def has_exception(self):
-        return (
-            self.res_overview.has_exception() or
-            self.res_crx.has_exception() or
-            (self.res_reviews is not None and self.res_reviews.has_exception())
-            or (self.res_support is not None and
-                self.res_support.has_exception()))
+        return (self.res_overview.has_exception() or
+                self.res_crx.has_exception() or
+                (self.res_reviews is not None and
+                 self.res_reviews.has_exception()) or (
+                     self.res_support is not None and
+                     self.res_support.has_exception()))
 
     def raised_google_ddos(self):
-        return (
-            (self.res_reviews is not None and self.res_reviews.not_available())
-            or (self.res_support is not None and
-                self.res_support.not_available()))
+        return ((self.res_reviews is not None and
+                 self.res_reviews.not_available()) or
+                (self.res_support is not None and
+                 self.res_support.not_available()))
 
     def not_modified(self):
         return self.res_crx.not_modified()
@@ -132,7 +132,7 @@ class UpdateResult:
 
     def sql_exception(self):
        return self.res_sql is not None
-    
+
     def sql_success(self):
         return self.sql_update
 
@@ -261,11 +261,10 @@ def update_crx(archivedir, tmptardir, verbose, ext_id, date):
         extfilename = "default.crx"
 
     if res.status_code == 304:
-        res = requests.head(
+        etag = requests.head(
             const_download_url().format(ext_id),
             timeout=10,
-            allow_redirects=True)
-        etag = res.headers.get('Etag')
+            allow_redirects=True).headers.get('ETag')
         write_text(tmptardir, date, extfilename + ".etag", etag)
         logtxt = logmsg(verbose, logtxt, (
             " - checking etag, last: {}\n" +
@@ -389,7 +388,8 @@ def update_extension(archivedir, verbose, forums, ext_id):
             logtxt = logmsg(verbose, logtxt,
                             " / Exception: {}\n".format(str(e)))
             tar_exception = e
             return UpdateResult(ext_id, is_new, tar_exception, res_overview,
-                                res_crx, res_reviews, res_support, sql_exception, False)
+                                res_crx, res_reviews, res_support, sql_exception,
+                                False)
 
     res_overview, msg_overview = update_overview(tmptardir, date, verbose, ext_id)
@@ -453,10 +453,11 @@ def update_extension(archivedir, verbose, forums, ext_id):
         pass
 
     try:
-        sql_success, msg_updatesqlite = update_sqlite(archivedir, tmptardir, ext_id, date, is_new,
-                                                      verbose, 11 * " ")
+        logtxt = logmsg(verbose, logtxt, " * Updating db...\n")
+        msg_updatesqlite = update_sqlite_incremental(
+            archivedir, tmptardir, ext_id, date, verbose, 15 * " ")
         logtxt = logmsg(verbose, logtxt, msg_updatesqlite)
-        
+        sql_success = True
     except Exception as e:
         logtxt = logmsg(verbose, logtxt,
                         " * Exception during update of sqlite db ")
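The update_crx hunk above folds the separate HEAD request and header lookup into one expression and switches to the canonical 'ETag' spelling. For reference, a minimal sketch of the conditional-download pattern in isolation — the URL below is a placeholder, not the crawler's real download-URL template:

    import requests

    def fetch_current_etag(url):
        # A HEAD request retrieves the headers of the CRX without
        # downloading the payload again.
        return requests.head(
            url, timeout=10, allow_redirects=True).headers.get('ETag')

    etag = fetch_current_etag("https://example.org/extension.crx")

Note that requests exposes headers through a case-insensitive mapping, so the old 'Etag' key also worked; the rename is cosmetic.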
diff --git a/ExtensionCrawler/config.py b/ExtensionCrawler/config.py
index d65f70d..61148ab 100644
--- a/ExtensionCrawler/config.py
+++ b/ExtensionCrawler/config.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 #
 # Copyright (C) 2016,2017 The University of Sheffield, UK
-# 
+#
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
@@ -85,3 +85,8 @@ def get_local_archive_dir(id):
 def archive_file(archivedir, ext_id):
     return os.path.join(
         str(archivedir), get_local_archive_dir(ext_id), ext_id + ".tar")
+
+
+def db_file(archivedir, ext_id):
+    return os.path.join(archivedir,
+                        get_local_archive_dir(ext_id), ext_id + ".sqlite")
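The new db_file helper mirrors archive_file, so the per-extension SQLite database always sits next to the extension's tar archive. A hypothetical usage sketch (the 32-character id is made up; the exact directory layout comes from get_local_archive_dir):

    from ExtensionCrawler.config import archive_file, db_file

    ext_id = "a" * 32  # hypothetical extension id
    print(archive_file("archive/data", ext_id))  # .../<prefix>/<ext_id>.tar
    print(db_file("archive/data", ext_id))       # .../<prefix>/<ext_id>.sqlite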
diff --git a/ExtensionCrawler/sqlite.py b/ExtensionCrawler/sqlite.py
index 11e144c..e2bea15 100644
--- a/ExtensionCrawler/sqlite.py
+++ b/ExtensionCrawler/sqlite.py
@@ -18,7 +18,6 @@
 from ExtensionCrawler.config import *
 from ExtensionCrawler.util import *
 from ExtensionCrawler.crx import *
-
 from ExtensionCrawler.archive import *
 
 import sqlite3
@@ -27,33 +26,82 @@ from bs4 import BeautifulSoup
-from zipfile import ZipFile
+from zipfile import ZipFile, BadZipfile
 import json
 import os
-import tempfile
-import tarfile
 import glob
 
 
-class SqliteUpdateError(Exception):
-    def __init__(self, reason="unknown"):
-        self.reason = reason
+def setup_tables(con):
+    con.execute("""CREATE TABLE review ("""
+                """id INTEGER PRIMARY KEY,"""
+                """extid TEXT,"""
+                """date TEXT,"""
+                """author TEXT,"""
+                """displayname TEXT,"""
+                """reviewdate INTEGER,"""
+                """rating INTEGER,"""
+                """language TEXT,"""
+                """shortauthor TEXT,"""
+                """comment TEXT"""
+                """)""")
+    con.execute("""CREATE TABLE category ("""
+                """extid TEXT,"""
+                """date TEXT,"""
+                """category TEXT,"""
+                """PRIMARY KEY (extid, date, category)"""
+                """)""")
+    con.execute("""CREATE TABLE permission ("""
+                """crx_etag TEXT,"""
+                """permission TEXT,"""
+                """PRIMARY KEY (crx_etag, permission)"""
+                """)""")
+    con.execute("""CREATE TABLE crx ("""
+                """etag TEXT PRIMARY KEY,"""
+                """filename TEXT,"""
+                """publickey BLOB"""
+                """)""")
+    con.execute("""CREATE TABLE status ("""
+                """extid TEXT,"""
+                """date TEXT,"""
+                """crx_status INTEGER,"""
+                """overview_status INTEGER,"""
+                """overview_exception TEXT,"""
+                """PRIMARY KEY (extid, date)"""
+                """)""")
+    con.execute("""CREATE TABLE extension ("""
+                """extid TEXT,"""
+                """date TEXT,"""
+                """name TEXT,"""
+                """version TEXT,"""
+                """description TEXT,"""
+                """downloads INTEGER,"""
+                """fulldescription TEXT,"""
+                """developer TEXT,"""
+                """crx_etag TEXT,"""
+                """lastupdated TEXT,"""
+                """PRIMARY KEY (extid, date),"""
+                """FOREIGN KEY (crx_etag) REFERENCES crx(etag)"""
+                """)""")
 
 
-def get_etag(ext_id, datepath, con):
-    #Trying etag file
-    etagpath = next(iter(glob.glob(os.path.join(datepath, "*.etag"))), None)
-    if etagpath:
-        with open(etagpath) as f:
-            return f.read()
+def get_etag(ext_id, datepath, con, verbose, indent):
+    txt = ""
 
-    #Trying to parse header file for etag
+    # Trying to parse header file for etag
     headerpath = next(
         iter(glob.glob(os.path.join(datepath, "*.crx.headers"))), None)
     if headerpath:
         with open(headerpath) as f:
-            headers = eval(f.read())
-            if "ETag" in headers:
-                return headers["ETag"]
+            content = f.read()
+            try:
+                headers = eval(content)
+                if "ETag" in headers:
+                    return headers["ETag"], txt
+            except Exception:
+                txt = logmsg(
+                    verbose, txt,
+                    indent + "* WARNING: could not parse crx header file")
+            pass
 
-    #Trying to look up previous etag in database
+    # Trying to look up previous etag in database
     linkpath = next(
         iter(glob.glob(os.path.join(datepath, "*.crx.link"))), None)
     if linkpath:
@@ -66,12 +114,16 @@ def get_etag(ext_id, datepath, con):
                 "SELECT crx_etag FROM extension WHERE extid=? AND date=?",
                 (ext_id, linked_date)), None)
         if row:
-            return row[0]
+            return row[0], txt
+
+    return None, txt
 
 
 def get_overview_status(datepath):
-    with open(os.path.join(datepath, "overview.html.status")) as f:
-        return int(f.read())
+    overviewstatuspath = os.path.join(datepath, "overview.html.status")
+    if os.path.exists(overviewstatuspath):
+        with open(overviewstatuspath) as f:
+            return int(f.read())
 
 
 def get_crx_status(datepath):
@@ -81,141 +133,222 @@ def get_crx_status(datepath):
         with open(statuspath) as f:
             return int(f.read())
 
+    # If the extension is paid, we will find a main.headers file...
+    statuspath = os.path.join(datepath, "main.status")
+    if os.path.exists(statuspath):
+        with open(statuspath) as f:
+            return int(f.read())
+
+    # ... or a default.crx.headers file
+    statuspath = os.path.join(datepath, "default.crx.status")
+    if os.path.exists(statuspath):
+        with open(statuspath) as f:
+            return int(f.read())
+
+
+def parse_and_insert_overview(ext_id, date, datepath, con, verbose, indent):
+    txt = ""
 
-def parse_and_insert_overview(ext_id, date, datepath, con):
     overview_path = os.path.join(datepath, "overview.html")
-    with open(overview_path) as overview_file:
-        contents = overview_file.read()
+    if os.path.exists(overview_path):
+        with open(overview_path) as overview_file:
+            contents = overview_file.read()
 
-    # Extract extension name
-    match = re.search("""<meta itemprop="name" content="(.*?)"/>""",
-                      contents)
-    name = match.group(1) if match else None
+        # Extract extension name
+        match = re.search("""<meta itemprop="name" content="(.*?)"/>""",
+                          contents)
+        name = match.group(1) if match else None
 
-    # Extract extension version
-    match = re.search("""<meta itemprop="version" content="(.*?)"/>""",
-                      contents)
-    version = match.group(1) if match else None
+        # Extract extension version
+        match = re.search(
+            """<meta itemprop="version" content="(.*?)"/>""", contents)
+        version = match.group(1) if match else None
 
-    # Extracts extension categories
-    match = re.search("""Attribute name="category">(.+?)</Attribute>""",
-                      contents)
-    categories = match.group(1).split(",") if match else None
+        # Extracts extension categories
+        match = re.search(
+            """Attribute name="category">(.+?)</Attribute>""", contents)
+        categories = match.group(1).split(",") if match else None
 
-    # Extracts the number of downloads
-    match = re.search("""user_count.*?(\d+)""", contents)
-    downloads = int(match.group(1)) if match else None
+        # Extracts the number of downloads
+        match = re.search("""user_count.*?(\d+)""", contents)
+        downloads = int(match.group(1)) if match else None
 
-    # Extracts the full extension description as it appears on the overview page
-    doc = BeautifulSoup(contents, 'html.parser')
+        # Extracts the full extension description as it appears on the
+        # overview page
+        doc = BeautifulSoup(contents, 'html.parser')
 
-    description_parent = doc.find('div', itemprop="description")
-    description = str(description_parent.contents[
-        0]) if description_parent and description_parent.contents else None
-    full_description = str(
-        description_parent.parent) if description_parent else None
+        description_parent = doc.find('div', itemprop="description")
+        description = str(
+            description_parent.contents[0]
+        ) if description_parent and description_parent.contents else None
+        full_description = str(
+            description_parent.parent) if description_parent else None
 
-    developer_parent = doc.find(class_=lambda cls: cls and "e-f-Me" in cls)
-    developer = str(
-        developer_parent.contents[0]) if developer_parent else None
+        developer_parent = doc.find(
+            class_=lambda cls: cls and "e-f-Me" in cls)
+        developer = str(
+            developer_parent.contents[0]) if developer_parent else None
 
-    last_updated_parent = doc.find(
-        class_=lambda cls: cls and "h-C-b-p-D-xh-hh" in cls)
-    last_updated = str(
-        last_updated_parent.contents[0]) if last_updated_parent else None
+        last_updated_parent = doc.find(
+            class_=lambda cls: cls and "h-C-b-p-D-xh-hh" in cls)
+        last_updated = str(last_updated_parent.contents[
+            0]) if last_updated_parent else None
 
-    etag = get_etag(ext_id, datepath, con)
+        etag, etag_msg = get_etag(ext_id, datepath, con, verbose, indent)
+        txt = logmsg(verbose, txt, etag_msg)
 
-    overview_status = get_overview_status(datepath)
+        con.execute("INSERT INTO extension VALUES (?,?,?,?,?,?,?,?,?,?)",
+                    (ext_id, date, name, version, description, downloads,
+                     full_description, developer, etag, last_updated))
 
-    crx_status = get_crx_status(datepath)
+        if categories:
+            for category in categories:
+                con.execute("INSERT INTO category VALUES (?,?,?)",
+                            (ext_id, date, category))
 
-    con.execute("INSERT INTO extension VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
-                (ext_id, date, name, version, description, downloads,
-                 full_description, developer, etag, last_updated,
-                 overview_status, crx_status))
-
-    if categories:
-        for category in categories:
-            con.execute("INSERT INTO category VALUES (?,?,?)",
-                        (ext_id, date, category))
+    return txt
 
 
-def parse_and_insert_crx(ext_id, date, datepath, con):
-    etag = get_etag(ext_id, datepath, con)
-
+def parse_and_insert_crx(ext_id, date, datepath, con, verbose, indent):
+    txt = ""
     crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
-    filename = os.path.basename(crx_path)
+    if crx_path:
+        filename = os.path.basename(crx_path)
 
-    with ZipFile(crx_path) as f:
-        with f.open("manifest.json") as m:
-            try:
+        with ZipFile(crx_path) as f:
+            etag, etag_msg = get_etag(ext_id, datepath, con, verbose, indent)
+            txt = logmsg(verbose, txt, etag_msg)
+            with f.open("manifest.json") as m:
+                raw_content = m.read()
                 # There are some manifests that seem to have weird encodings...
-                manifest = json.loads(m.read().decode("utf-8-sig"))
+                try:
+                    content = raw_content.decode("utf-8-sig")
+                except UnicodeDecodeError:
+                    # Trying a different encoding, manifests are weird...
+                    content = raw_content.decode("latin1")
+
+                # Attempt to remove JavaScript-style comments from json
+                comment_regex = re.compile(r'\s*//.*')
+                multiline_comment_regex = re.compile(r'\s*/\*.*\*/')
+                lines = content.splitlines()
+                for index, line in enumerate(lines):
+                    if comment_regex.match(
+                            line) or multiline_comment_regex.match(line):
+                        lines[index] = ""
+                content = "\n".join(lines)
+
+                manifest = json.loads(content, strict=False)
                 if "permissions" in manifest:
                     for permission in manifest["permissions"]:
                         con.execute(
                             "INSERT OR REPLACE INTO permission VALUES (?,?)",
                             (etag, str(permission)))
-            except json.decoder.JSONDecodeError:
-                pass
-    public_key = read_crx(crx_path).pk
-
-    con.execute("INSERT INTO crx VALUES (?,?,?)", (etag, filename,
-                                                   public_key))
-
-
-def update_sqlite_incremental(db_path, datepath, ext_id, date, verbose,
-                              indent):
-    txt = ""
-
-    txt = logmsg(verbose, txt,
-                 indent + "- updating using {}\n".format(datepath))
-
-    if not os.path.exists(db_path):
-        raise SqliteUpdateError("db file not found")
-
-    with sqlite3.connect(db_path) as con:
-        parse_and_insert_overview(ext_id, date, datepath, con)
-
-        crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
-
-        etag = get_etag(ext_id, datepath, con)
-        etag_already_in_db = next(
-            con.execute("SELECT COUNT(etag) FROM crx WHERE etag=?", (etag, )))[
-                0]
-        if etag and not etag_already_in_db:
-            if crx_path:
-                parse_and_insert_crx(ext_id, date, datepath, con)
-            else:
-                raise SqliteUpdateError(
-                    "etag not in db and no crx file present")
+            public_key = read_crx(crx_path).pk
+            con.execute("INSERT INTO crx VALUES (?,?,?)", (etag, filename,
+                                                           public_key))
 
     return txt
 
 
-def update_sqlite(archivedir, tmptardir, ext_id, date, is_new, verbose, indent):
-    update_successful = False
+def get(d, k):
+    if d and k in d:
+        return d[k]
+
+
+def parse_and_insert_review(ext_id, date, reviewpath, con):
+    with open(reviewpath) as f:
+        content = f.read()
+        stripped = content[content.find('{"'):]
+        d = json.JSONDecoder().raw_decode(stripped)
+        annotations = get(next(iter(d), None), "annotations")
+        if annotations:
+            for review in d[0]["annotations"]:
+                timestamp = get(review, "timestamp")
+                starRating = get(review, "starRating")
+                comment = get(review, "comment")
+                displayname = get(get(review, "entity"), "displayName")
+                author = get(get(review, "entity"), "author")
+                language = get(get(review, "entity"), "language")
+                shortauthor = get(get(review, "entity"), "shortAuthor")
+
+                con.execute("INSERT INTO review VALUES(?,?,?,?,?,?,?,?,?,?)",
+                            (None, ext_id, date, author, displayname,
+                             timestamp, starRating, language, shortauthor,
+                             comment))
+
+
+def parse_and_insert_status(ext_id, date, datepath, con):
+    overview_status = get_overview_status(datepath)
+    crx_status = get_crx_status(datepath)
+
+    overviewexceptionpath = os.path.join(datepath, "overview.html.exception")
+    overview_exception = None
+    if os.path.exists(overviewexceptionpath):
+        with open(overviewexceptionpath) as f:
+            overview_exception = f.read()
+
+    con.execute("INSERT INTO status VALUES (?,?,?,?,?)",
+                (ext_id, date, crx_status, overview_status,
+                 overview_exception))
+
+
+def update_sqlite_incremental(archivedir, tmptardir, ext_id, date, verbose,
+                              indent):
     txt = ""
     indent2 = indent + 4 * " "
 
+    db_path = db_file(archivedir, ext_id)
     datepath = os.path.join(tmptardir, date)
 
     txt = logmsg(verbose, txt,
-                 indent + "* extracting information into SQLite db...\n")
+                 indent + "- updating with data from {}\n".format(date))
 
-    db_path = os.path.join(archivedir, ext_id[:3], ext_id + ".sqlite")
".sqlite") + if not os.path.exists(db_path): + txt = logmsg(verbose, txt, + indent2 + "* db file does not exist, creating...\n") + with sqlite3.connect(db_path) as con: + setup_tables(con) - txt = logmsg(verbose, txt, - indent2 + "- attempting incremental update...\n") - try: - updatetxt = update_sqlite_incremental(db_path, datepath, ext_id, date, - verbose, indent2) - txt = logmsg(verbose, txt, updatetxt) - update_successful = True - except SqliteUpdateError as e: - txt = logmsg( - verbose, txt, - indent2 + "- incremental update failed: {}\n".format(e.reason)) + with sqlite3.connect(db_path) as con: + parse_and_insert_status(ext_id, date, datepath, con) - return update_successful, txt + parse_and_insert_overview(ext_id, date, datepath, con, verbose, + indent2) + + etag, etag_msg = get_etag(ext_id, datepath, con, verbose, indent2) + txt = logmsg(verbose, txt, etag_msg) + etag_already_in_db = next( + con.execute("SELECT COUNT(etag) FROM crx WHERE etag=?", (etag, )))[ + 0] + + if etag: + if not etag_already_in_db: + try: + crx_msg = parse_and_insert_crx(ext_id, date, datepath, con, + verbose, indent) + txt = logmsg(verbose, txt, crx_msg) + except zipfile.BadZipfile as e: + txt = logmsg( + verbose, txt, indent2 + + "* WARNING: the found crx file is not a zip file, exception: " + ) + txt = logmsg(verbose, txt, str(e)) + txt = logmsg(verbose, txt, "\n") + else: + crx_status = get_crx_status(datepath) + if crx_status != 401 and crx_status != 204 and crx_status != 404: + txt = logmsg(verbose, txt, + indent2 + "* WARNING: could not find etag\n") + + reviewpaths = glob.glob(os.path.join(datepath, "reviews*.text")) + for reviewpath in reviewpaths: + try: + parse_and_insert_review(ext_id, date, reviewpath, con) + except json.decoder.JSONDecodeError as e: + txt = logmsg( + verbose, txt, + indent2 + "* Could not parse review file, exception: ") + txt = logmsg(verbose, txt, str(e)) + txt = logmsg(verbose, txt, "\n") + return txt diff --git a/create_db b/create_db index 3b22a1f..ca2ba6a 100755 --- a/create_db +++ b/create_db @@ -22,52 +22,11 @@ import sys import glob import tarfile import tempfile +import traceback +from multiprocessing import Pool from ExtensionCrawler.sqlite import * - - -def setup_tables(con): - con.execute("""CREATE TABLE review (""" - """id INTEGER PRIMARY KEY,""" - """extid TEXT,""" - """date TEXT,""" - """user TEXT,""" - """reviewdate TEXT,""" - """rating TEXT,""" - """comment TEXT""" - """)""") - con.execute("""CREATE TABLE category (""" - """extid TEXT,""" - """date TEXT,""" - """category TEXT,""" - """PRIMARY KEY (extid, date, category)""" - """)""") - con.execute("""CREATE TABLE permission (""" - """crx_etag TEXT,""" - """permission TEXT,""" - """PRIMARY KEY (crx_etag, permission)""" - """)""") - con.execute("""CREATE TABLE crx (""" - """etag TEXT PRIMARY KEY,""" - """filename TEXT,""" - """publickey BLOB""" - """)""") - con.execute("""CREATE TABLE extension (""" - """extid TEXT,""" - """date TEXT,""" - """name TEXT,""" - """version TEXT,""" - """description TEXT,""" - """downloads INTEGER,""" - """fulldescription TEXT,""" - """developer TEXT,""" - """crx_etag TEXT,""" - """lastupdated TEXT,""" - """crx_status INTEGER,""" - """overview_status INTEGER,""" - """PRIMARY KEY (extid, date),""" - """FOREIGN KEY (crx_etag) REFERENCES crx(etag)""" - """)""") +from ExtensionCrawler.config import * def help(): @@ -75,13 +34,45 @@ def help(): print(" -h print this help text") print(" -a= archive directory") print(" -p= three-letter-prefix") + print(" -t= number of parallel threads") + + 
diff --git a/create_db b/create_db
index 3b22a1f..ca2ba6a 100755
--- a/create_db
+++ b/create_db
@@ -22,52 +22,12 @@
 import sys
 import glob
 import tarfile
 import tempfile
+import traceback
+from functools import partial
+from multiprocessing import Pool
 
 from ExtensionCrawler.sqlite import *
-
-
-def setup_tables(con):
-    con.execute("""CREATE TABLE review ("""
-                """id INTEGER PRIMARY KEY,"""
-                """extid TEXT,"""
-                """date TEXT,"""
-                """user TEXT,"""
-                """reviewdate TEXT,"""
-                """rating TEXT,"""
-                """comment TEXT"""
-                """)""")
-    con.execute("""CREATE TABLE category ("""
-                """extid TEXT,"""
-                """date TEXT,"""
-                """category TEXT,"""
-                """PRIMARY KEY (extid, date, category)"""
-                """)""")
-    con.execute("""CREATE TABLE permission ("""
-                """crx_etag TEXT,"""
-                """permission TEXT,"""
-                """PRIMARY KEY (crx_etag, permission)"""
-                """)""")
-    con.execute("""CREATE TABLE crx ("""
-                """etag TEXT PRIMARY KEY,"""
-                """filename TEXT,"""
-                """publickey BLOB"""
-                """)""")
-    con.execute("""CREATE TABLE extension ("""
-                """extid TEXT,"""
-                """date TEXT,"""
-                """name TEXT,"""
-                """version TEXT,"""
-                """description TEXT,"""
-                """downloads INTEGER,"""
-                """fulldescription TEXT,"""
-                """developer TEXT,"""
-                """crx_etag TEXT,"""
-                """lastupdated TEXT,"""
-                """crx_status INTEGER,"""
-                """overview_status INTEGER,"""
-                """PRIMARY KEY (extid, date),"""
-                """FOREIGN KEY (crx_etag) REFERENCES crx(etag)"""
-                """)""")
+from ExtensionCrawler.config import *
 
 
 def help():
@@ -75,13 +35,45 @@ def help():
     print(" -h print this help text")
     print(" -a= archive directory")
     print(" -p= three-letter-prefix")
+    print(" -t= number of parallel threads")
+
+
+def process_id(archivedir, verbose, ext_id):
+    txt = ""
+    txt = logmsg(verbose, txt, "Processing {} ...\n".format(ext_id))
+
+    tarpath = archive_file(archivedir, ext_id)
+    dbpath = db_file(archivedir, ext_id)
+    if os.path.exists(dbpath):
+        os.remove(dbpath)
+    with tempfile.TemporaryDirectory() as tmpdir:
+        with tarfile.open(tarpath) as t:
+            t.extractall(tmpdir)
+        iddir = os.path.join(tmpdir, ext_id)
+
+        for date in sorted(os.listdir(iddir)):
+            try:
+                update_txt = update_sqlite_incremental(
+                    archivedir, iddir, ext_id, date, True, "")
+                txt = logmsg(verbose, txt, update_txt)
+            except Exception as e:
+                txt = logmsg(verbose, txt,
                              "Exception when handling {} on {}:\n".format(
+                                 ext_id, date))
+                txt = logmsg(verbose, txt, traceback.format_exc())
+
+    txt = logmsg(verbose, txt, "\n")
+
+    return txt
 
 
 def main(argv):
     basedir = "archive"
     prefix = ""
+    parallel = 8
     try:
-        opts, args = getopt.getopt(argv, "ha:p:", ["archive=", "prefix="])
+        opts, args = getopt.getopt(argv, "ha:p:t:",
+                                   ["archive=", "prefix=", "threads="])
     except getopt.GetoptError:
         help()
         sys.exit(2)
@@ -93,27 +85,17 @@ def main(argv):
             basedir = arg
         elif opt in ("-p", "--prefix"):
             prefix = arg
+        elif opt in ("-t", "--threads"):
+            parallel = int(arg)
 
-    archive_dir = os.path.join(basedir, "data")
-    threeletterdirs = glob.glob(os.path.join(archive_dir, prefix + "*"))
+    archivedir = os.path.join(basedir, "data")
+    threeletterdirs = glob.glob(os.path.join(archivedir, prefix + "*"))
     for threeletterdir in threeletterdirs:
-        for ext_id in set([d[:32] for d in os.listdir(threeletterdir)]):
-            tarpath = os.path.join(threeletterdir, ext_id + ".tar")
-            dbpath = os.path.join(threeletterdir, ext_id + ".sqlite")
-            if os.path.exists(dbpath):
-                os.remove(dbpath)
-            with tempfile.TemporaryDirectory() as tmpdir:
-                with tarfile.open(tarpath) as t:
-                    t.extractall(tmpdir)
-                iddir = os.path.join(tmpdir, ext_id)
-
-                with sqlite3.connect(dbpath) as con:
-                    setup_tables(con)
-                for date in sorted(os.listdir(iddir)):
-                    datepath = os.path.join(iddir, date)
-                    print(
-                        update_sqlite_incremental(dbpath, datepath, ext_id,
-                                                  date, True, ""))
+        ext_ids = list(set([d[:32] for d in os.listdir(threeletterdir)]))
+        with Pool(parallel) as p:
+            for txt in p.imap(partial(process_id, archivedir, True), ext_ids):
+                sys.stdout.write(txt)
+                sys.stdout.flush()
 
 
 if __name__ == "__main__":
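create_db now fans the per-extension work out over a process pool. The combination used above — Pool.imap over partial(process_id, archivedir, True) — streams each worker's accumulated log text back to the parent in input order, so output stays deterministic while the heavy tar/SQLite work runs in parallel. A self-contained sketch of the same pattern (process_item is a stand-in for process_id):

    from functools import partial
    from multiprocessing import Pool

    def process_item(prefix, verbose, item):
        # Workers return their log text instead of printing it, so the
        # parent alone controls how output is interleaved.
        return "{} {}\n".format(prefix, item) if verbose else ""

    if __name__ == "__main__":
        with Pool(4) as p:
            for txt in p.imap(partial(process_item, "processed", True),
                              range(10)):
                print(txt, end="")

Unlike Pool.map, imap yields each result as soon as it is ready (still in order), which is why main can write and flush incrementally instead of waiting for a whole three-letter directory to finish.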