diff --git a/ExtensionCrawler/db.py b/ExtensionCrawler/db.py
index 8c04301..0ed7dfc 100644
--- a/ExtensionCrawler/db.py
+++ b/ExtensionCrawler/db.py
@@ -420,42 +420,40 @@ def update_db_incremental(tmptardir, ext_id, date):
     if etag:
         try:
             parse_and_insert_crx(ext_id, date, datepath, con)
-        except zipfile.BadZipfile as e:
-            log_warning(
-                "* WARNING: the found crx file is not a zip file, exception: {}".
-                format(str(e)), 3, ext_id)
+        except Exception as e:
+            log_exception("Exception when parsing crx", 3, ext_id)
     else:
         crx_status = get_crx_status(datepath)
         if crx_status != 401 and crx_status != 204 and crx_status != 404:
             log_warning("* WARNING: could not find etag", 3, ext_id)
 
-    parse_and_insert_overview(ext_id, date, datepath, con)
-    parse_and_insert_status(ext_id, date, datepath, con)
+    try:
+        parse_and_insert_overview(ext_id, date, datepath, con)
+    except Exception as e:
+        log_exception("Exception when parsing overview", 3, ext_id)
+
+    try:
+        parse_and_insert_status(ext_id, date, datepath, con)
+    except Exception as e:
+        log_exception("Exception when parsing status", 3, ext_id)
 
     reviewpaths = glob.glob(os.path.join(datepath, "reviews*-*.text"))
     for reviewpath in reviewpaths:
         try:
             parse_and_insert_review(ext_id, date, reviewpath, con)
-        except json.decoder.JSONDecodeError as e:
-            log_warning(
-                "* Could not parse review file, exception: {}".format(
-                    str(e)), 3, ext_id)
+        except Exception as e:
+            log_exception("Exception when parsing review", 3, ext_id)
 
     supportpaths = glob.glob(os.path.join(datepath, "support*-*.text"))
     for supportpath in supportpaths:
         try:
             parse_and_insert_support(ext_id, date, supportpath, con)
-        except json.decoder.JSONDecodeError as e:
-            log_warning(
-                "* Could not parse support file, exception: {}".format(
-                    str(e)), 3, ext_id)
+        except Exception as e:
+            log_exception("Exception when parsing support", 3, ext_id)
 
     repliespaths = glob.glob(os.path.join(datepath, "*replies.text"))
     for repliespath in repliespaths:
         try:
             parse_and_insert_replies(ext_id, date, repliespath, con)
-        except json.decoder.JSONDecodeError as e:
-            log_warning(
-                "* Could not parse reply file, exception: {}".format(
-                    str(e)), 3, ext_id)
-    con.commit()
+        except Exception as e:
+            log_exception("Exception when parsing reply", 3, ext_id)
diff --git a/ExtensionCrawler/dbbackend/mysql_backend.py b/ExtensionCrawler/dbbackend/mysql_backend.py
index 21be609..11b7cf3 100644
--- a/ExtensionCrawler/dbbackend/mysql_backend.py
+++ b/ExtensionCrawler/dbbackend/mysql_backend.py
@@ -52,6 +52,9 @@ class MysqlBackend:
                         if db is not None:
                             db.close()
                             db = None
+                        db = MySQLdb.connect(**self.dbargs)
+                        db.autocommit = True
+                        self.cursor = db.cursor()
                     except Exception as e2:
                         log_error("Surpressed exception: {}".format(str(e2)), 3, self.ext_id)
                     raise last_exception
@@ -72,6 +75,7 @@ class MysqlBackend:
         global db
         if db is None:
             db = MySQLdb.connect(**self.dbargs)
+            db.autocommit = True
             self.cursor = db.cursor()
         return self
 
@@ -84,9 +88,6 @@ class MysqlBackend:
         except Exception as e:
             log_error("Surpressed exception: {}".format(str(e)), 3,
                       self.ext_id)
 
-    def commit(self):
-        db.commit()
-
     def get_single_value(self, query, args):
         self.retry(lambda: self.cursor.execute(query, args))
diff --git a/create-db b/create-db
index 1fcbd98..824f611 100755
--- a/create-db
+++ b/create-db
@@ -34,19 +34,21 @@ from ExtensionCrawler.util import log_info, log_warning, log_error, log_exceptio
 
 
 def help():
-    print("create-db [OPTION]")
-    print("    -h                  print this help text")
-    print("    -a                  archive directory")
-    print("    -p                  three-letter-prefix")
-    print("    -e                  file with extension ids")
-    print("    --from-date         only process information gathered after this date")
-    print("                        (compared lexographically)")
-    print("    -t                  number of parallel threads")
-    print("    -n                  process chunk n where n in [1,N]")
-    print("    -N                  ")
+    print("""create-db [OPTION]""")
+    print("""    -h                  print this help text""")
+    print("""    -a                  archive directory""")
+    print("""    -p                  three-letter-prefix""")
+    print("""    -e                  file with extension ids""")
+    print("""    --from-date         only process information gathered after"""
+          """ this date (compared lexographically)""")
+    print("""    --until-date        only process information gathered before"""
+          """ this date (compared lexographically)""")
+    print("""    -t                  number of parallel threads""")
+    print("""    -n                  process chunk n where n in [1,N]""")
+    print("""    -N                  """)
 
 
-def process_id(from_date, path):
+def process_id(from_date, until_date, path):
     start = time.time()
     with tempfile.TemporaryDirectory() as tmpdir:
         with tarfile.open(path) as t:
@@ -57,7 +59,8 @@ def process_id(from_date, path):
 
         iddir = os.path.join(tmpdir, extid)
         for date in sorted(os.listdir(iddir)):
-            if from_date is not None and date < from_date:
+            if (from_date is not None and date < from_date) or \
+                    (until_date is not None and date > until_date):
                 log_info("* Skipping {}".format(date), 2, extid)
                 continue
             try:
@@ -97,13 +100,14 @@ def parse_args(argv):
     taskid = 1
     maxtaskid = 1
     from_date = None
+    until_date = None
     paths = []
 
     try:
         opts, args = getopt.getopt(argv, "ha:p:e:t:n:N:", [
            "archive=", "prefix=", "extidlistfile=", "threads=", "taskid=",
-            "maxtaskid=", "from-date=", "help"
+            "maxtaskid=", "from-date=", "until-date=", "help"
        ])
    except getopt.GetoptError:
        help()
@@ -126,6 +130,8 @@ def parse_args(argv):
             maxtaskid = int(arg)
         elif opt in ("--from-date"):
             from_date = arg
+        elif opt in ("--until-date"):
+            until_date = arg
 
     if paths == []:
         paths = list(find(archive, "*"))
@@ -136,16 +142,16 @@ def parse_args(argv):
     else:
         paths = paths[(taskid - 1) * chunksize:taskid * chunksize]
 
-    return paths, parallel, from_date
+    return paths, parallel, from_date, until_date
 
 
 def main(argv):
     logging.basicConfig(level=logging.INFO, format=const_log_format())
 
-    paths, parallel, from_date = parse_args(argv)
+    paths, parallel, from_date, until_date = parse_args(argv)
 
     with Pool(processes=parallel) as p:
-        p.map(partial(process_id, from_date), paths)
+        p.map(partial(process_id, from_date, until_date), paths)
 
 
 if __name__ == "__main__":