Merge branch 'master' into production

Achim D. Brucker 2017-06-20 20:03:10 +01:00
commit 8dbd535e3e
5 changed files with 327 additions and 204 deletions

.gitignore

@@ -58,4 +58,7 @@ docs/_build/
# PyBuilder
target/
# vi
*.swp
archive


@@ -76,7 +76,7 @@ class RequestResult:
class UpdateResult:
def __init__(self, id, is_new, exception, res_overview, res_crx,
res_reviews, res_support,res_sql, sql_update):
res_reviews, res_support, res_sql, sql_update):
self.id = id
self.new = is_new
self.exception = exception
@@ -111,18 +111,18 @@ class UpdateResult:
(self.res_support is not None and self.res_support.not_found()))
def has_exception(self):
return (
self.res_overview.has_exception() or
self.res_crx.has_exception() or
(self.res_reviews is not None and self.res_reviews.has_exception())
or (self.res_support is not None and
self.res_support.has_exception()))
return (self.res_overview.has_exception() or
self.res_crx.has_exception() or
(self.res_reviews is not None and
self.res_reviews.has_exception()) or (
self.res_support is not None and
self.res_support.has_exception()))
def raised_google_ddos(self):
return (
(self.res_reviews is not None and self.res_reviews.not_available())
or (self.res_support is not None and
self.res_support.not_available()))
return ((self.res_reviews is not None and
self.res_reviews.not_available()) or
(self.res_support is not None and
self.res_support.not_available()))
def not_modified(self):
return self.res_crx.not_modified()
@@ -132,7 +132,7 @@ class UpdateResult:
def sql_exception(self):
return self.res_sql is not None
def sql_success(self):
return self.sql_update
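
A minimal sketch of how a caller might inspect an UpdateResult after a crawl; only the constructor and methods visible in this diff are assumed, the rest is illustrative:

res = update_extension(archivedir, verbose, forums, ext_id)
if res.has_exception():
    print("{}: download raised an exception".format(res.id))
elif res.sql_exception():
    print("{}: archived, but the SQLite update failed".format(res.id))
elif res.sql_success():
    print("{}: archive and database updated".format(res.id))
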
@@ -261,11 +261,10 @@ def update_crx(archivedir, tmptardir, verbose, ext_id, date):
extfilename = "default.crx"
if res.status_code == 304:
res = requests.head(
etag = requests.head(
const_download_url().format(ext_id),
timeout=10,
allow_redirects=True)
etag = res.headers.get('Etag')
allow_redirects=True).headers.get('ETag')
write_text(tmptardir, date, extfilename + ".etag", etag)
logtxt = logmsg(verbose, logtxt, (
" - checking etag, last: {}\n" +
@@ -389,7 +388,8 @@ def update_extension(archivedir, verbose, forums, ext_id):
logtxt = logmsg(verbose, logtxt, " / Exception: {}\n".format(str(e)))
tar_exception = e
return UpdateResult(ext_id, is_new, tar_exception, res_overview,
res_crx, res_reviews, res_support, sql_exception, False)
res_crx, res_reviews, res_support, sql_exception,
False)
res_overview, msg_overview = update_overview(tmptardir, date, verbose,
ext_id)
@@ -453,10 +453,11 @@ def update_extension(archivedir, verbose, forums, ext_id):
pass
try:
sql_success, msg_updatesqlite = update_sqlite(archivedir, tmptardir, ext_id, date, is_new,
verbose, 11 * " ")
logtxt = logmsg(verbose, logtxt, " * Updating db...\n")
msg_updatesqlite = update_sqlite_incremental(
archivedir, tmptardir, ext_id, date, verbose, 15 * " ")
logtxt = logmsg(verbose, logtxt, msg_updatesqlite)
sql_success = True
except Exception as e:
logtxt = logmsg(verbose, logtxt,
" * Exception during update of sqlite db ")


@@ -1,7 +1,7 @@
#!/usr/bin/env python3
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
@@ -85,3 +85,8 @@ def get_local_archive_dir(id):
def archive_file(archivedir, ext_id):
return os.path.join(
str(archivedir), get_local_archive_dir(ext_id), ext_id + ".tar")
def db_file(archivedir, ext_id):
return os.path.join(archivedir,
get_local_archive_dir(ext_id), ext_id + ".sqlite")
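
The new db_file helper mirrors archive_file, so the per-extension tar and SQLite database sit next to each other. A rough sketch of the resulting layout, assuming (as the three-letter-prefix option in create_db below suggests) that get_local_archive_dir shards by the first three characters of the extension id:

import os

def get_local_archive_dir(id):
    return id[:3]   # assumption for illustration only

archivedir = "archive/data"   # placeholder
ext_id = "a" * 32             # made-up 32-character extension id

tar_path = os.path.join(archivedir, get_local_archive_dir(ext_id), ext_id + ".tar")
db_path = os.path.join(archivedir, get_local_archive_dir(ext_id), ext_id + ".sqlite")
# e.g. archive/data/aaa/aaa...a.tar next to archive/data/aaa/aaa...a.sqlite
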


@@ -18,7 +18,6 @@
from ExtensionCrawler.config import *
from ExtensionCrawler.util import *
from ExtensionCrawler.crx import *
from ExtensionCrawler.archive import *
import sqlite3
@@ -27,33 +26,82 @@ from bs4 import BeautifulSoup
from zipfile import ZipFile
import json
import os
import tempfile
import tarfile
import glob
class SqliteUpdateError(Exception):
def __init__(self, reason="unknown"):
self.reason = reason
def setup_tables(con):
con.execute("""CREATE TABLE review ("""
"""id INTEGER PRIMARY KEY,"""
"""extid TEXT,"""
"""date TEXT,"""
"""author TEXT,"""
"""displayname TEXT,"""
"""reviewdate INTEGER,"""
"""rating INTEGER,"""
"""language TEXT,"""
"""shortauthor TEXT,"""
"""comment TEXT"""
""")""")
con.execute("""CREATE TABLE category ("""
"""extid TEXT,"""
"""date TEXT,"""
"""category TEXT,"""
"""PRIMARY KEY (extid, date, category)"""
""")""")
con.execute("""CREATE TABLE permission ("""
"""crx_etag TEXT,"""
"""permission TEXT,"""
"""PRIMARY KEY (crx_etag, permission)"""
""")""")
con.execute("""CREATE TABLE crx ("""
"""etag TEXT PRIMARY KEY,"""
"""filename TEXT,"""
"""publickey BLOB"""
""")""")
con.execute("""CREATE TABLE status ("""
"""extid TEXT,"""
"""date TEXT,"""
"""crx_status INTEGER,"""
"""overview_status INTEGER,"""
"""overview_exception TEXT,"""
"""PRIMARY KEY (extid, date)"""
""")""")
con.execute("""CREATE TABLE extension ("""
"""extid TEXT,"""
"""date TEXT,"""
"""name TEXT,"""
"""version TEXT,"""
"""description TEXT,"""
"""downloads INTEGER,"""
"""fulldescription TEXT,"""
"""developer TEXT,"""
"""crx_etag TEXT,"""
"""lastupdated TEXT,"""
"""PRIMARY KEY (extid, date),"""
"""FOREIGN KEY (crx_etag) REFERENCES crx(etag)"""
""")""")
def get_etag(ext_id, datepath, con):
#Trying etag file
etagpath = next(iter(glob.glob(os.path.join(datepath, "*.etag"))), None)
if etagpath:
with open(etagpath) as f:
return f.read()
def get_etag(ext_id, datepath, con, verbose, indent):
txt = ""
#Trying to parse header file for etag
# Trying to parse header file for etag
headerpath = next(
iter(glob.glob(os.path.join(datepath, "*.crx.headers"))), None)
if headerpath:
with open(headerpath) as f:
headers = eval(f.read())
if "ETag" in headers:
return headers["ETag"]
content = f.read()
try:
headers = eval(content)
if "ETag" in headers:
return headers["ETag"], txt
except Exception:
txt = logmsg(
verbose, txt,
indent + "* WARNING: could not parse crx header file")
pass
#Trying to look up previous etag in database
# Trying to look up previous etag in database
linkpath = next(
iter(glob.glob(os.path.join(datepath, "*.crx.link"))), None)
if linkpath:
@@ -66,12 +114,16 @@ def get_etag(ext_id, datepath, con):
"SELECT crx_etag FROM extension WHERE extid=? AND date=?",
(ext_id, linked_date)), None)
if row:
return row[0]
return row[0], txt
return None, txt
def get_overview_status(datepath):
with open(os.path.join(datepath, "overview.html.status")) as f:
return int(f.read())
overviewstatuspath = os.path.join(datepath, "overview.html.status")
if os.path.exists(overviewstatuspath):
with open(overviewstatuspath) as f:
return int(f.read())
def get_crx_status(datepath):
@@ -81,141 +133,222 @@ def get_crx_status(datepath):
with open(statuspath) as f:
return int(f.read())
# If the extension is paid, we will find a main.headers file...
statuspath = os.path.join(datepath, "main.status")
if os.path.exists(statuspath):
with open(statuspath) as f:
return int(f.read())
# ... or a default.crx.headers file
statuspath = os.path.join(datepath, "default.crx.status")
if os.path.exists(statuspath):
with open(statuspath) as f:
return int(f.read())
def parse_and_insert_overview(ext_id, date, datepath, con, verbose, indent):
txt = ""
def parse_and_insert_overview(ext_id, date, datepath, con):
overview_path = os.path.join(datepath, "overview.html")
with open(overview_path) as overview_file:
contents = overview_file.read()
if os.path.exists(overview_path):
with open(overview_path) as overview_file:
contents = overview_file.read()
# Extract extension name
match = re.search("""<meta itemprop="name" content="(.*?)"\s*/>""",
contents)
name = match.group(1) if match else None
# Extract extension name
match = re.search("""<meta itemprop="name" content="(.*?)"\s*/>""",
contents)
name = match.group(1) if match else None
# Extract extension version
match = re.search("""<meta itemprop="version" content="(.*?)"\s*/>""",
contents)
version = match.group(1) if match else None
# Extract extension version
match = re.search(
"""<meta itemprop="version" content="(.*?)"\s*/>""", contents)
version = match.group(1) if match else None
# Extracts extension categories
match = re.search("""Attribute name="category">(.+?)</Attribute>""",
contents)
categories = match.group(1).split(",") if match else None
# Extracts extension categories
match = re.search(
"""Attribute name="category">(.+?)</Attribute>""", contents)
categories = match.group(1).split(",") if match else None
# Extracts the number of downloads
match = re.search("""user_count.*?(\d+)""", contents)
downloads = int(match.group(1)) if match else None
# Extracts the number of downloads
match = re.search("""user_count.*?(\d+)""", contents)
downloads = int(match.group(1)) if match else None
# Extracts the full extension description as it appears on the overview page
doc = BeautifulSoup(contents, 'html.parser')
# Extracts the full extension description as it appears on the
# overview page
doc = BeautifulSoup(contents, 'html.parser')
description_parent = doc.find('div', itemprop="description")
description = str(description_parent.contents[
0]) if description_parent and description_parent.contents else None
full_description = str(
description_parent.parent) if description_parent else None
description_parent = doc.find('div', itemprop="description")
description = str(
description_parent.contents[0]
) if description_parent and description_parent.contents else None
full_description = str(
description_parent.parent) if description_parent else None
developer_parent = doc.find(class_=lambda cls: cls and "e-f-Me" in cls)
developer = str(
developer_parent.contents[0]) if developer_parent else None
developer_parent = doc.find(
class_=lambda cls: cls and "e-f-Me" in cls)
developer = str(
developer_parent.contents[0]) if developer_parent else None
last_updated_parent = doc.find(
class_=lambda cls: cls and "h-C-b-p-D-xh-hh" in cls)
last_updated = str(
last_updated_parent.contents[0]) if last_updated_parent else None
last_updated_parent = doc.find(
class_=lambda cls: cls and "h-C-b-p-D-xh-hh" in cls)
last_updated = str(last_updated_parent.contents[
0]) if last_updated_parent else None
etag = get_etag(ext_id, datepath, con)
etag, etag_msg = get_etag(ext_id, datepath, con, verbose, indent)
txt = logmsg(verbose, txt, etag_msg)
overview_status = get_overview_status(datepath)
con.execute("INSERT INTO extension VALUES (?,?,?,?,?,?,?,?,?,?)",
(ext_id, date, name, version, description, downloads,
full_description, developer, etag, last_updated))
crx_status = get_crx_status(datepath)
if categories:
for category in categories:
con.execute("INSERT INTO category VALUES (?,?,?)",
(ext_id, date, category))
con.execute("INSERT INTO extension VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
(ext_id, date, name, version, description, downloads,
full_description, developer, etag, last_updated,
overview_status, crx_status))
if categories:
for category in categories:
con.execute("INSERT INTO category VALUES (?,?,?)",
(ext_id, date, category))
return txt
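
parse_and_insert_overview mixes regular expressions over the raw HTML with BeautifulSoup lookups. A toy run of the same extraction on a made-up overview fragment:

import re
from bs4 import BeautifulSoup

contents = ('<meta itemprop="name" content="Example Extension"/>'
            '<meta itemprop="version" content="1.2.3"/>'
            '<div itemprop="description">Does example things.</div>')

match = re.search(r'<meta itemprop="name" content="(.*?)"\s*/>', contents)
name = match.group(1) if match else None
match = re.search(r'<meta itemprop="version" content="(.*?)"\s*/>', contents)
version = match.group(1) if match else None

doc = BeautifulSoup(contents, 'html.parser')
description_parent = doc.find('div', itemprop="description")
description = (str(description_parent.contents[0])
               if description_parent and description_parent.contents else None)
print(name, version, description)   # Example Extension 1.2.3 Does example things.
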
def parse_and_insert_crx(ext_id, date, datepath, con):
etag = get_etag(ext_id, datepath, con)
def parse_and_insert_crx(ext_id, date, datepath, con, verbose, indent):
txt = ""
crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
filename = os.path.basename(crx_path)
if crx_path:
filename = os.path.basename(crx_path)
with ZipFile(crx_path) as f:
with f.open("manifest.json") as m:
try:
with ZipFile(crx_path) as f:
etag, etag_msg = get_etag(ext_id, datepath, con, verbose, indent)
txt = logmsg(verbose, txt, etag_msg)
with f.open("manifest.json") as m:
raw_content = m.read()
# There are some manifests that seem to have weird encodings...
manifest = json.loads(m.read().decode("utf-8-sig"))
try:
content = raw_content.decode("utf-8-sig")
except UnicodeDecodeError:
# Trying a different encoding, manifests are weird...
content = raw_content.decode("latin1")
# Attempt to remove JavaScript-style comments from json
comment_regex = re.compile(r'\s*//.*')
multiline_comment_regex = re.compile(r'\s*/\\*.*\\*/')
lines = content.splitlines()
for index, line in enumerate(lines):
if comment_regex.match(
line) or multiline_comment_regex.match(line):
lines[index] = ""
content = "\n".join(lines)
manifest = json.loads(content, strict=False)
if "permissions" in manifest:
for permission in manifest["permissions"]:
con.execute(
"INSERT OR REPLACE INTO permission VALUES (?,?)",
(etag, str(permission)))
except json.decoder.JSONDecodeError:
pass
public_key = read_crx(crx_path).pk
con.execute("INSERT INTO crx VALUES (?,?,?)", (etag, filename,
public_key))
def update_sqlite_incremental(db_path, datepath, ext_id, date, verbose,
indent):
txt = ""
txt = logmsg(verbose, txt,
indent + "- updating using {}\n".format(datepath))
if not os.path.exists(db_path):
raise SqliteUpdateError("db file not found")
with sqlite3.connect(db_path) as con:
parse_and_insert_overview(ext_id, date, datepath, con)
crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
etag = get_etag(ext_id, datepath, con)
etag_already_in_db = next(
con.execute("SELECT COUNT(etag) FROM crx WHERE etag=?", (etag, )))[
0]
if etag and not etag_already_in_db:
if crx_path:
parse_and_insert_crx(ext_id, date, datepath, con)
else:
raise SqliteUpdateError(
"etag not in db and no crx file present")
public_key = read_crx(crx_path).pk
con.execute("INSERT INTO crx VALUES (?,?,?)", (etag, filename,
public_key))
return txt
def update_sqlite(archivedir, tmptardir, ext_id, date, is_new, verbose, indent):
update_successful = False
def get(d, k):
if d and k in d:
return d[k]
def parse_and_insert_review(ext_id, date, reviewpath, con):
with open(reviewpath) as f:
content = f.read()
stripped = content[content.find('{"'):]
d = json.JSONDecoder().raw_decode(stripped)
annotations = get(next(iter(d), None), "annotations")
if annotations:
for review in d[0]["annotations"]:
timestamp = get(review, "timestamp")
starRating = get(review, "starRating")
comment = get(review, "comment")
displayname = get(get(review, "entity"), "displayName")
author = get(get(review, "entity"), "author")
language = get(get(review, "entity"), "language")
shortauthor = get(get(review, "entity"), "shortAuthor")
con.execute("INSERT INTO review VALUES(?,?,?,?,?,?,?,?,?,?)",
(None, ext_id, date, author, displayname,
timestamp, starRating, language, shortauthor,
comment))
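
parse_and_insert_review drops everything before the first '{"' because the stored review responses start with a non-JSON prefix; raw_decode then parses the first JSON value and ignores trailing bytes. A small illustration on made-up data:

import json

# Made-up review payload with a junk prefix, shaped like the stored responses.
content = 'junk-prefix {"annotations": [{"comment": "Great extension", "starRating": 5}]}'
stripped = content[content.find('{"'):]
obj, end = json.JSONDecoder().raw_decode(stripped)
for review in obj.get("annotations", []):
    print(review.get("starRating"), review.get("comment"))
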
def parse_and_insert_status(ext_id, date, datepath, con):
overview_status = get_overview_status(datepath)
crx_status = get_crx_status(datepath)
overviewexceptionpath = os.path.join(datepath, "overview.html.exception")
overview_exception = None
if os.path.exists(overviewexceptionpath):
with open(overviewexceptionpath) as f:
overview_exception = f.read()
con.execute("INSERT INTO status VALUES (?,?,?,?,?)",
(ext_id, date, crx_status, overview_status,
overview_exception))
def update_sqlite_incremental(archivedir, tmptardir, ext_id, date, verbose,
indent):
txt = ""
indent2 = indent + 4 * " "
db_path = db_file(archivedir, ext_id)
datepath = os.path.join(tmptardir, date)
txt = logmsg(verbose, txt,
indent + "* extracting information into SQLite db...\n")
indent + "- updating with data from {}\n".format(date))
db_path = os.path.join(archivedir, ext_id[:3], ext_id + ".sqlite")
if not os.path.exists(db_path):
txt = logmsg(verbose, txt,
indent2 + "* db file does not exist, creating...\n")
with sqlite3.connect(db_path) as con:
setup_tables(con)
txt = logmsg(verbose, txt,
indent2 + "- attempting incremental update...\n")
try:
updatetxt = update_sqlite_incremental(db_path, datepath, ext_id, date,
verbose, indent2)
txt = logmsg(verbose, txt, updatetxt)
update_successful = True
except SqliteUpdateError as e:
txt = logmsg(
verbose, txt,
indent2 + "- incremental update failed: {}\n".format(e.reason))
with sqlite3.connect(db_path) as con:
parse_and_insert_status(ext_id, date, datepath, con)
return update_successful, txt
parse_and_insert_overview(ext_id, date, datepath, con, verbose,
indent2)
etag, etag_msg = get_etag(ext_id, datepath, con, verbose, indent2)
txt = logmsg(verbose, txt, etag_msg)
etag_already_in_db = next(
con.execute("SELECT COUNT(etag) FROM crx WHERE etag=?", (etag, )))[
0]
if etag:
if not etag_already_in_db:
try:
crx_msg = parse_and_insert_crx(ext_id, date, datepath, con,
verbose, indent)
txt = logmsg(verbose, txt, crx_msg)
except zipfile.BadZipfile as e:
txt = logmsg(
verbose, txt, indent2 +
"* WARNING: the found crx file is not a zip file, exception: "
)
txt = logmsg(verbose, txt, str(e))
txt = logmsg(verbose, txt, "\n")
else:
crx_status = get_crx_status(datepath)
if crx_status != 401 and crx_status != 204 and crx_status != 404:
txt = logmsg(verbose, txt,
indent2 + "* WARNING: could not find etag\n")
reviewpaths = glob.glob(os.path.join(datepath, "reviews*.text"))
for reviewpath in reviewpaths:
try:
parse_and_insert_review(ext_id, date, reviewpath, con)
except json.decoder.JSONDecodeError as e:
txt = logmsg(
verbose, txt,
indent2 + "* Could not parse review file, exception: ")
txt = logmsg(verbose, txt, str(e))
txt = logmsg(verbose, txt, "\n")
return txt
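
parse_and_insert_crx above tolerates manifests with byte-order marks, non-UTF-8 bytes and JavaScript-style comments before handing them to json.loads. A condensed, standalone sketch of that cleanup; the sample manifest is made up and only the line-comment case is shown:

import json
import re

raw_content = b'\xef\xbb\xbf{\n  // default settings\n  "name": "Example",\n  "permissions": ["tabs"]\n}'
try:
    content = raw_content.decode("utf-8-sig")
except UnicodeDecodeError:
    content = raw_content.decode("latin1")   # some manifests are not valid UTF-8

# Blank out lines that are only JavaScript-style comments, then parse leniently.
comment_regex = re.compile(r'\s*//.*')
lines = ["" if comment_regex.match(line) else line for line in content.splitlines()]
manifest = json.loads("\n".join(lines), strict=False)
print(manifest.get("permissions", []))   # ['tabs']
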

create_db

@@ -22,52 +22,11 @@ import sys
import glob
import tarfile
import tempfile
import traceback
from multiprocessing import Pool
from ExtensionCrawler.sqlite import *
def setup_tables(con):
con.execute("""CREATE TABLE review ("""
"""id INTEGER PRIMARY KEY,"""
"""extid TEXT,"""
"""date TEXT,"""
"""user TEXT,"""
"""reviewdate TEXT,"""
"""rating TEXT,"""
"""comment TEXT"""
""")""")
con.execute("""CREATE TABLE category ("""
"""extid TEXT,"""
"""date TEXT,"""
"""category TEXT,"""
"""PRIMARY KEY (extid, date, category)"""
""")""")
con.execute("""CREATE TABLE permission ("""
"""crx_etag TEXT,"""
"""permission TEXT,"""
"""PRIMARY KEY (crx_etag, permission)"""
""")""")
con.execute("""CREATE TABLE crx ("""
"""etag TEXT PRIMARY KEY,"""
"""filename TEXT,"""
"""publickey BLOB"""
""")""")
con.execute("""CREATE TABLE extension ("""
"""extid TEXT,"""
"""date TEXT,"""
"""name TEXT,"""
"""version TEXT,"""
"""description TEXT,"""
"""downloads INTEGER,"""
"""fulldescription TEXT,"""
"""developer TEXT,"""
"""crx_etag TEXT,"""
"""lastupdated TEXT,"""
"""crx_status INTEGER,"""
"""overview_status INTEGER,"""
"""PRIMARY KEY (extid, date),"""
"""FOREIGN KEY (crx_etag) REFERENCES crx(etag)"""
""")""")
from ExtensionCrawler.config import *
def help():
@@ -75,13 +34,45 @@ def help():
print(" -h print this help text")
print(" -a=<DIR> archive directory")
print(" -p=<PREFIX> three-letter-prefix")
print(" -t=<THREADS> number of parallel threads")
def process_id(archivedir, verbose, ext_id):
txt = ""
txt = logmsg(verbose, txt, "Processing {} ...\n".format(ext_id))
tarpath = archive_file(archivedir, ext_id)
dbpath = db_file(archivedir, ext_id)
if os.path.exists(dbpath):
os.remove(dbpath)
with tempfile.TemporaryDirectory() as tmpdir:
with tarfile.open(tarpath) as t:
t.extractall(tmpdir)
iddir = os.path.join(tmpdir, ext_id)
for date in sorted(os.listdir(iddir)):
try:
update_txt = update_sqlite_incremental(
archivedir, iddir, ext_id, date, True, "")
txt = logmsg(verbose, txt, update_txt)
except Exception as e:
txt = logmsg(verbose, txt,
"Exception when handling {} on {}:\n".format(
ext_id, date))
txt = logmsg(verbose, txt, traceback.format_exc())
txt = logmsg(verbose, txt, "\n")
return txt
def main(argv):
basedir = "archive"
prefix = ""
parallel = 8
try:
opts, args = getopt.getopt(argv, "ha:p:", ["archive=", "prefix="])
opts, args = getopt.getopt(argv, "ha:p:t:",
["archive=", "prefix=", "threads="])
except getopt.GetoptError:
help()
sys.exit(2)
@@ -93,27 +84,17 @@ def main(argv):
basedir = arg
elif opt in ("-p", "--prefix"):
prefix = arg
elif opt in ("-t", "--threads"):
parallel = int(arg)
archive_dir = os.path.join(basedir, "data")
threeletterdirs = glob.glob(os.path.join(archive_dir, prefix + "*"))
archivedir = os.path.join(basedir, "data")
threeletterdirs = glob.glob(os.path.join(archivedir, prefix + "*"))
for threeletterdir in threeletterdirs:
for ext_id in set([d[:32] for d in os.listdir(threeletterdir)]):
tarpath = os.path.join(threeletterdir, ext_id + ".tar")
dbpath = os.path.join(threeletterdir, ext_id + ".sqlite")
if os.path.exists(dbpath):
os.remove(dbpath)
with tempfile.TemporaryDirectory() as tmpdir:
with tarfile.open(tarpath) as t:
t.extractall(tmpdir)
iddir = os.path.join(tmpdir, ext_id)
with sqlite3.connect(dbpath) as con:
setup_tables(con)
for date in sorted(os.listdir(iddir)):
datepath = os.path.join(iddir, date)
print(
update_sqlite_incremental(dbpath, datepath, ext_id,
date, True, ""))
ext_ids = list(set([d[:32] for d in os.listdir(threeletterdir)]))
with Pool(parallel) as p:
for txt in p.imap(partial(process_id, archivedir, True), ext_ids):
sys.stdout.write(txt)
sys.stdout.flush()
if __name__ == "__main__":
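
To close: create_db now fans the per-extension work out over a process pool. Below is a minimal standalone sketch of the imap/partial pattern it uses, with a placeholder worker in place of the real process_id:

import sys
from functools import partial
from multiprocessing import Pool

def work(archivedir, verbose, ext_id):
    # Placeholder standing in for the real per-extension update.
    return "processed {} under {} (verbose={})\n".format(ext_id, archivedir, verbose)

if __name__ == "__main__":
    ext_ids = ["a" * 32, "b" * 32, "c" * 32]   # made-up extension ids
    with Pool(4) as p:
        # imap evaluates lazily and yields results in input order.
        for txt in p.imap(partial(work, "archive/data", True), ext_ids):
            sys.stdout.write(txt)
            sys.stdout.flush()
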