Merge with upstream.
commit 3c9ccb4da9
@@ -34,14 +34,13 @@ import datetime
 import dateutil
 import dateutil.parser
 import requests
-import logging

 from ExtensionCrawler.config import (
     const_review_payload, const_review_search_url, const_download_url,
     get_local_archive_dir, const_overview_url, const_support_url,
     const_support_payload, const_review_search_payload, const_review_url)
-from ExtensionCrawler.util import google_dos_protection, value_of
-from ExtensionCrawler.sqlite import db_file, update_sqlite_incremental
+from ExtensionCrawler.util import google_dos_protection, value_of, log_info, log_exception
+from ExtensionCrawler.db import update_db_incremental


 class Error(Exception):
@@ -223,11 +222,10 @@ def update_overview(tar, date, ext_id):
     res = None
     try:
         res = requests.get(const_overview_url(ext_id), timeout=10)
-        logging.info(8 * " " +
-                     "* overview page: {}".format(str(res.status_code)))
+        log_info("* overview page: {}".format(str(res.status_code)), 2, ext_id)
         store_request_text(tar, date, 'overview.html', res)
     except Exception as e:
-        logging.exception("Exception when retrieving overview page")
+        log_exception("Exception when retrieving overview page", 2, ext_id)
         write_text(tar, date, 'overview.html.exception',
                    traceback.format_exc())
         return RequestResult(res, e)
@@ -266,8 +264,9 @@ def update_crx(archivedir, tmptardir, ext_id, date):
             stream=True,
             headers=headers,
             timeout=10)
-        logging.info(8 * " " + "* crx archive (Last: {}): {}".format(
-            value_of(last_crx_http_date, "n/a"), str(res.status_code)))
+        log_info("* crx archive (Last: {}): {}".format(
+            value_of(last_crx_http_date, "n/a"), str(res.status_code)), 2,
+                 ext_id)
         extfilename = os.path.basename(res.url)
         if re.search('&', extfilename):
             extfilename = "default.crx"
@@ -278,12 +277,12 @@ def update_crx(archivedir, tmptardir, ext_id, date):
                 timeout=10,
                 allow_redirects=True).headers.get('ETag')
             write_text(tmptardir, date, extfilename + ".etag", etag)
-            logging.info(12 * " " +
-                         "- checking etag, last: {}".format(last_crx_etag))
-            logging.info(12 * " " + "  current: {}".format(etag))
+            log_info("- checking etag, last: {}".format(last_crx_etag), 3,
+                     ext_id)
+            log_info("  current: {}".format(etag), 3, ext_id)

             if (etag is not "") and (etag != last_crx_etag):
-                logging.info(12 * " " + "- downloading due to different etags")
+                log_info("- downloading due to different etags", 3, ext_id)

                 res = requests.get(
                     const_download_url().format(ext_id),
@@ -304,7 +303,7 @@ def update_crx(archivedir, tmptardir, ext_id, date):
                 write_text(tmptardir, date, extfilename + ".etag",
                            res.headers.get("ETag"))
     except Exception as e:
-        logging.exception("Exception when updating crx")
+        log_exception("Exception when updating crx", 3, ext_id)
         write_text(tmptardir, date, extfilename + ".exception",
                    traceback.format_exc())
         return RequestResult(res, e)
@@ -330,8 +329,8 @@ def update_reviews(tar, date, ext_id):
             const_review_url(),
             data=const_review_payload(ext_id, "0", "100"),
             timeout=10)
-        logging.info(8 * " " +
-                     "* review page 0-100: {}".format(str(res.status_code)))
+        log_info("* review page 0-100: {}".format(str(res.status_code)), 2,
+                 ext_id)
         store_request_text(tar, date, 'reviews000-099.text', res)
         pages += [res.text]

@@ -340,8 +339,8 @@ def update_reviews(tar, date, ext_id):
             const_review_url(),
             data=const_review_payload(ext_id, "100", "100"),
             timeout=10)
-        logging.info(8 * " " + "* review page 100-200: {}".format(
-            str(res.status_code)))
+        log_info("* review page 100-200: {}".format(str(res.status_code)), 2,
+                 ext_id)
         store_request_text(tar, date, 'reviews100-199.text', res)
         pages += [res.text]

@@ -354,11 +353,11 @@ def update_reviews(tar, date, ext_id):
                 const_review_search_url(),
                 data=const_review_search_payload(ext_id_author_tups),
                 timeout=10)
-            logging.info(8 * " " + "* review page replies: {}".format(
-                str(res.status_code)))
+            log_info("* review page replies: {}".format(str(res.status_code)),
+                     2, ext_id)
             store_request_text(tar, date, 'reviewsreplies.text', res)
     except Exception as e:
-        logging.exception("Exception when updating reviews")
+        log_exception("Exception when updating reviews", 2, ext_id)
         write_text(tar, date, 'reviews.html.exception', traceback.format_exc())
         return RequestResult(res, e)
     return RequestResult(res)
@@ -374,8 +373,8 @@ def update_support(tar, date, ext_id):
             const_support_url(),
             data=const_support_payload(ext_id, "0", "100"),
             timeout=10)
-        logging.info(8 * " " +
-                     "* support page 0-100: {}".format(str(res.status_code)))
+        log_info("* support page 0-100: {}".format(str(res.status_code)), 2,
+                 ext_id)
         store_request_text(tar, date, 'support000-099.text', res)
         pages += [res.text]

@@ -384,8 +383,8 @@ def update_support(tar, date, ext_id):
             const_support_url(),
             data=const_support_payload(ext_id, "100", "100"),
             timeout=10)
-        logging.info(8 * " " +
-                     "* support page 100-200: {}".format(str(res.status_code)))
+        log_info("* support page 100-200: {}".format(str(res.status_code)), 2,
+                 ext_id)
         store_request_text(tar, date, 'support100-199.text', res)
         pages += [res.text]

@@ -398,19 +397,19 @@ def update_support(tar, date, ext_id):
                 const_review_search_url(),
                 data=const_review_search_payload(ext_id_author_tups),
                 timeout=10)
-            logging.info(8 * " " + "* support page replies: {}".format(
-                str(res.status_code)))
+            log_info("* support page replies: {}".format(str(res.status_code)),
+                     2, ext_id)
             store_request_text(tar, date, 'supportreplies.text', res)
     except Exception as e:
-        logging.exception("Exception when updating support pages")
+        log_exception("Exception when updating support pages", 2, ext_id)
         write_text(tar, date, 'support.html.exception', traceback.format_exc())
         return RequestResult(res, e)
     return RequestResult(res)


 def update_extension(archivedir, forums, ext_id):
-    logging.info(4 * " " + "Updating extension {}{}".format(
-        ext_id, " (including forums)" if forums else ""))
+    log_info("Updating extension {}".format(" (including forums)"
+                                            if forums else ""), 1, ext_id)
     is_new = False
     tar_exception = None
     sql_exception = None

@@ -426,12 +425,12 @@ def update_extension(archivedir, forums, ext_id):
     try:
         tmpdir = tempfile.mkdtemp()
         tmptardir = os.path.join(tmpdir, ext_id)
-        logging.info(8 * " " + "* tmptardir = {}".format(tmptardir))
+        log_info("* tmptardir = {}".format(tmptardir), 2, ext_id)
         os.makedirs(
             os.path.join(archivedir, get_local_archive_dir(ext_id)),
             exist_ok=True)
     except Exception as e:
-        logging.exception(8 * " " + "* FATAL: cannot create tmpdir")
+        log_exception("* FATAL: cannot create tmpdir", 3, ext_id)
         tar_exception = e
         return UpdateResult(ext_id, is_new, tar_exception, None, None, None,
                             None, sql_exception, False)
@@ -462,8 +461,7 @@ def update_extension(archivedir, forums, ext_id):
         if os.path.exists(tar):
             shutil.copyfile(tar, tardir + ".bak.tar")
     except Exception as e:
-        logging.exception(8 * " " +
-                          "* FATAL: cannot rename old tar archive")
+        log_exception("* FATAL: cannot rename old tar archive", 3, ext_id)
         tar_exception = e
         try:
             write_text(tardir, date, ext_id + ".tar.rename.exception",
@@ -478,7 +476,7 @@ def update_extension(archivedir, forums, ext_id):
             ar.add(tmptardir, arcname=ext_id)
             ar.close()
     except Exception as e:
-        logging.exception(8 * " " + "* FATAL: cannot create tar archive")
+        log_exception("* FATAL: cannot create tar archive", 3, ext_id)
         tar_exception = e
         try:
             write_text(tardir, date, ext_id + ".tar.create.exception",
@@ -487,12 +485,10 @@ def update_extension(archivedir, forums, ext_id):
             pass

     try:
-        logging.info(8 * " " + "* Updating db...")
-        db_path = db_file(archivedir, ext_id)
-        update_sqlite_incremental(db_path, tmptardir, ext_id, date)
+        update_db_incremental(tmptardir, ext_id, date)
         sql_success = True
     except Exception as e:
-        logging.exception(8 * " " + "* Exception during update of sqlite db")
+        log_exception("* Exception during update of db", 3, ext_id)
         sql_exception = e

     try:
@@ -503,7 +499,7 @@ def update_extension(archivedir, forums, ext_id):
     try:
         shutil.rmtree(path=tmpdir)
     except Exception as e:
-        logging.exception(8 * " " + "* FATAL: cannot remove archive directory")
+        log_exception("* FATAL: cannot remove archive directory", 3, ext_id)
         tar_exception = e
         try:
             write_text(tardir, date, ext_id + ".dir.remove.exception",
@@ -511,8 +507,11 @@ def update_extension(archivedir, forums, ext_id):
         except Exception:
             pass

-    logging.info(8 * " " + "* Duration: {}".format(
-        datetime.timedelta(seconds=int(time.time() - start))))
+    log_info(
+        "* Duration: {}".format(
+            datetime.timedelta(seconds=int(time.time() - start))),
+        2,
+        ext_id)
     return UpdateResult(ext_id, is_new, tar_exception, res_overview, res_crx,
                         res_reviews, res_support, sql_exception, sql_success)

@@ -522,22 +521,20 @@ def update_extensions(archivedir, parallel, forums_ext_ids, ext_ids):
     ext_without_forums = []
     ext_ids = list(set(ext_ids) - set(forums_ext_ids))
     forums_ext_ids = list(set(forums_ext_ids))
-    logging.info("Updating {} extensions ({} including forums)".format(
+    log_info("Updating {} extensions ({} including forums)".format(
         len(ext_ids), len(forums_ext_ids)))
     # First, update extensions with forums sequentially (and with delays) to
     # avoid running into Googles DDOS detection.
-    logging.info(2 * " " +
-                 "Updating {} extensions including forums (sequentially))".
-                 format(len(forums_ext_ids)))
+    log_info("Updating {} extensions including forums (sequentially))".format(
+        len(forums_ext_ids)), 1)

     ext_with_forums = list(
         map(partial(update_extension, archivedir, True), forums_ext_ids))

     # Second, update extensions without forums parallel to increase speed.
     parallel_ids = list(set(ext_ids) - set(forums_ext_ids))
-    logging.info(2 * " " +
-                 "Updating {} extensions excluding forums (parallel))".format(
-                 len(parallel_ids)))
+    log_info("Updating {} extensions excluding forums (parallel))".format(
+        len(parallel_ids)), 1)
     with Pool(parallel) as p:
         ext_without_forums = list(
             p.map(partial(update_extension, archivedir, False), parallel_ids))

@@ -125,15 +125,6 @@ def archive_file(archivedir, ext_id):
         str(archivedir), get_local_archive_dir(ext_id), ext_id + ".tar")


-def db_file(archivedir, ext_id):
-    """DB (sqlite) file of an extension."""
-    return os.path.join(archivedir,
-                        get_local_archive_dir(ext_id), ext_id + ".sqlite")
-
-def jsloc_timeout():
-    """Maximum number of seconds for counting jsloc per extension."""
-    return 600
-
 def const_basedir():
     """Top-level directory for the extension crawler archive."""
     return "archive"

@@ -153,14 +144,11 @@ def const_discover():
     """Default configuration of discovery mode"""
     return False

-def const_use_mysql():
-    return False
-
 def const_mysql_config_file():
     return "~/.my.cnf"

 def const_mysql_maxtries():
-    return 3
+    return 30

 def const_mysql_try_wait():
-    return 2
+    return 10
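The two constant changes above are easy to read past: the MySQL retry budget grows from 3 attempts 2 s apart to 30 attempts 10 s apart, so a flaky MariaDB connection is now tolerated for several minutes before the error propagates (the retry loop itself lives in MysqlBackend below). A quick sanity check of that arithmetic, as a sketch:

    # Worst case: maxtries attempts, with a wait after every failed
    # attempt except the last (values introduced by this commit).
    maxtries, try_wait = 30, 10
    print((maxtries - 1) * try_wait)  # 290 s of waiting, i.e. roughly 5 minutes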
@@ -21,7 +21,6 @@ from ExtensionCrawler.crx import *
 from ExtensionCrawler.archive import *
 from ExtensionCrawler.js_decomposer import decompose_js, DetectionType, FileClassification

-from ExtensionCrawler.dbbackend.sqlite_backend import SqliteBackend
 from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend

 import re
@@ -31,7 +30,7 @@ import json
 import os
 import glob
 import datetime
-import logging
+import hashlib


 def get_etag(ext_id, datepath, con):
@@ -53,8 +52,8 @@ def get_etag(ext_id, datepath, con):
             if "ETag" in headers:
                 return headers["ETag"]
     except Exception:
-        logging.warning(16 * " " +
-                        "* WARNING: could not parse crx header file")
+        log_warning("* WARNING: could not parse crx header file", 3,
+                    ext_id)

     # Trying to look up previous etag in database
     linkpath = next(
@@ -64,8 +63,7 @@ def get_etag(ext_id, datepath, con):
         link = f.read()
         linked_date = link[3:].split("/")[0]

-        result = con.get_most_recent_etag(ext_id,
-                                          con.convert_date(linked_date))
+        result = con.get_etag(ext_id, con.convert_date(linked_date))
         if result is not None:
             return result

@@ -100,7 +98,7 @@ def get_crx_status(datepath):


 def parse_and_insert_overview(ext_id, date, datepath, con):
-    logging.info(16 * " " + "- parsing overview file")
+    log_debug("- parsing overview file", 3, ext_id)
     overview_path = os.path.join(datepath, "overview.html")
     if os.path.exists(overview_path):
         with open(overview_path) as overview_file:
@@ -187,13 +185,14 @@ def parse_and_insert_overview(ext_id, date, datepath, con):
             "category",
             extid=ext_id,
             date=con.convert_date(date),
+            category_md5=hashlib.md5(category.encode()).digest(),
             category=category)


 def parse_and_insert_crx(ext_id, date, datepath, con):
-    logging.info(16 * " " + "- parsing crx file")
     crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
     if crx_path:
+        log_debug("- parsing crx file", 3, ext_id)
         filename = os.path.basename(crx_path)

         with ZipFile(crx_path) as f:
@@ -235,6 +234,8 @@ def parse_and_insert_crx(ext_id, date, datepath, con):
                 con.insert(
                     "permission",
                     crx_etag=etag,
+                    permission_md5=hashlib.md5(
+                        str(permission).encode()).digest(),
                     permission=str(permission))
         if "content_scripts" in manifest:
             for csd in manifest["content_scripts"]:
@@ -243,21 +244,26 @@ def parse_and_insert_crx(ext_id, date, datepath, con):
                 con.insert(
                     "content_script_url",
                     crx_etag=etag,
+                    url_md5=hashlib.md5(
+                        str(urlpattern).encode()).digest(),
                     url=str(urlpattern))

         js_files = decompose_js(f)
         for js_file_info in js_files:
-            # TODO: Add: evidenceStartPos, evidenceEndPos, and EvidenceText, sha1
+            # TODO: md5, sha1, size, path, type, detect_method, crx_etag, filename should be non-null
             con.insert(
                 "jsfile",
                 crx_etag=etag,
                 detect_method=(js_file_info['detectionMethod']).value,
+                evidence_start_pos=str(js_file_info['evidenceStartPos']),
+                evidence_end_pos=str(js_file_info['evidenceEndPos']),
+                evidence_text=str(js_file_info['evidenceText']),
                 filename=js_file_info['jsFilename'],
                 type=(js_file_info['type']).value,
                 lib=js_file_info['lib'],
                 path=js_file_info['path'],
                 encoding=js_file_info['encoding'],
                 md5=js_file_info['md5'],
+                sha1=js_file_info['sha1'],
                 size=js_file_info['size'],
                 version=js_file_info['version'])

@@ -268,7 +274,7 @@ def get(d, k):


 def parse_and_insert_review(ext_id, date, reviewpath, con):
-    logging.info(16 * " " + "- parsing review file")
+    log_debug("- parsing review file", 3, ext_id)
     with open(reviewpath) as f:
         content = f.read()
         stripped = content[content.find('{"'):]
@@ -304,7 +310,7 @@ def parse_and_insert_review(ext_id, date, reviewpath, con):


 def parse_and_insert_support(ext_id, date, supportpath, con):
-    logging.info(16 * " " + "- parsing support file")
+    log_debug("- parsing support file", 3, ext_id)
     with open(supportpath) as f:
         content = f.read()
         stripped = content[content.find('{"'):]
@@ -340,12 +346,13 @@ def parse_and_insert_support(ext_id, date, supportpath, con):


 def parse_and_insert_replies(ext_id, date, repliespath, con):
-    logging.info(16 * " " + "- parsing reply file")
+    log_debug("- parsing reply file", 3, ext_id)
     with open(repliespath) as f:
         d = json.load(f)
         if not "searchResults" in d:
-            logging.warning("* WARNING: there are no search results in {}".
-                            format(repliespath))
+            log_warning("* WARNING: there are no search results in {}".format(
+                repliespath), 3, ext_id)
             return
         results = []
         for result in d["searchResults"]:
             if "annotations" not in result:
@@ -380,7 +387,7 @@ def parse_and_insert_replies(ext_id, date, repliespath, con):


 def parse_and_insert_status(ext_id, date, datepath, con):
-    logging.info(16 * " " + "- parsing status file")
+    log_debug("- parsing status file", 3, ext_id)
     overview_status = get_overview_status(datepath)
     crx_status = get_crx_status(datepath)

@@ -399,31 +406,26 @@ def parse_and_insert_status(ext_id, date, datepath, con):
         overview_exception=overview_exception)


-def update_sqlite_incremental(db_path, tmptardir, ext_id, date):
-    logging.info(12 * " " + "- parsing data from {}".format(date))
+def update_db_incremental(tmptardir, ext_id, date):
+    log_info("* Updating db with data from from {}".format(date), 2, ext_id)
     datepath = os.path.join(tmptardir, date)

-    if const_use_mysql():
-        # Don't forget to create a ~/.my.cnf file with the credentials
-        backend = MysqlBackend(read_default_file=const_mysql_config_file())
-    else:
-        backend = SqliteBackend(db_path)
-
-    with backend as con:
+    # Don't forget to create a ~/.my.cnf file with the credentials
+    with MysqlBackend(
+            ext_id, read_default_file=const_mysql_config_file()) as con:
         etag = get_etag(ext_id, datepath, con)

         if etag:
             try:
                 parse_and_insert_crx(ext_id, date, datepath, con)
             except zipfile.BadZipfile as e:
-                logging.warning(
-                    16 * " " +
+                log_warning(
                     "* WARNING: the found crx file is not a zip file, exception: {}".
-                    format(str(e)))
+                    format(str(e)), 3, ext_id)
         else:
             crx_status = get_crx_status(datepath)
             if crx_status != 401 and crx_status != 204 and crx_status != 404:
-                logging.warning(16 * " " + "* WARNING: could not find etag")
+                log_warning("* WARNING: could not find etag", 3, ext_id)

         parse_and_insert_overview(ext_id, date, datepath, con)
         parse_and_insert_status(ext_id, date, datepath, con)
@@ -433,24 +435,24 @@ def update_sqlite_incremental(db_path, tmptardir, ext_id, date):
             try:
                 parse_and_insert_review(ext_id, date, reviewpath, con)
             except json.decoder.JSONDecodeError as e:
-                logging.warning(16 * " " +
-                                "* Could not parse review file, exception: {}".
-                                format(str(e)))
+                log_warning(
+                    "* Could not parse review file, exception: {}".format(
+                        str(e)), 3, ext_id)

         supportpaths = glob.glob(os.path.join(datepath, "support*-*.text"))
         for supportpath in supportpaths:
             try:
                 parse_and_insert_support(ext_id, date, supportpath, con)
             except json.decoder.JSONDecodeError as e:
-                logging.warning(
-                    16 * " " + "* Could not parse support file, exception: {}".
-                    format(str(e)))
+                log_warning(
+                    "* Could not parse support file, exception: {}".format(
+                        str(e)), 3, ext_id)

         repliespaths = glob.glob(os.path.join(datepath, "*replies.text"))
         for repliespath in repliespaths:
             try:
                 parse_and_insert_replies(ext_id, date, repliespath, con)
             except json.decoder.JSONDecodeError as e:
-                logging.warning(16 * " " +
-                                "* Could not parse reply file, exception: {}".
-                                format(str(e)))
+                log_warning(
+                    "* Could not parse reply file, exception: {}".format(
+                        str(e)), 3, ext_id)
@@ -20,6 +20,7 @@ import MySQLdb
 import _mysql_exceptions
 import atexit
 import time
+from ExtensionCrawler.util import log_info, log_error, log_exception

 db = None

@@ -31,38 +32,47 @@ def close_db():

 atexit.register(close_db)

-def retry(f):
-    for t in range(const_mysql_maxtries()):
-        try:
-            return f()
-        except _mysql_exceptions.OperationalError as e:
-            last_exception = e
-            if t + 1 == const_mysql_maxtries():
-                raise last_exception
-            else:
-                time.sleep(const_mysql_try_wait())
-
-
 class MysqlBackend:
-    def __init__(self, **kwargs):
+    def retry(self, f):
+        for t in range(const_mysql_maxtries()):
+            try:
+                return f()
+            except _mysql_exceptions.OperationalError as e:
+                last_exception = e
+                if t + 1 == const_mysql_maxtries():
+                    log_exception("MySQL connection eventually failed!", 3,
+                                  self.ext_id)
+                    raise last_exception
+                else:
+                    log_exception(
+                        """Exception on mysql connection attempt {} of {}, """
+                        """wating {}s before retrying...""".format(
+                            t + 1,
+                            const_mysql_maxtries(),
+                            const_mysql_try_wait()), 3, self.ext_id)
+                    time.sleep(const_mysql_try_wait())
+
+    def __init__(self, ext_id, **kwargs):
+        self.ext_id = ext_id
         self.dbargs = kwargs

     def __enter__(self):
         global db
         if db is None:
-            db = retry(lambda: MySQLdb.connect(**self.dbargs))
-        self.cursor = retry(lambda: db.cursor())
+            db = self.retry(lambda: MySQLdb.connect(**self.dbargs))
+        self.cursor = self.retry(lambda: db.cursor())

         return self

     def __exit__(self, *args):
-        retry(lambda: db.commit())
-        retry(lambda: self.cursor.close())
+        self.retry(lambda: db.commit())
+        self.retry(lambda: self.cursor.close())

     def get_single_value(self, query, args):
-        retry(lambda: self.cursor.execute(query, args))
+        self.retry(lambda: self.cursor.execute(query, args))

-        result = retry(lambda: self.cursor.fetchone())
+        result = self.retry(lambda: self.cursor.fetchone())
         if result is not None:
             return result[0]
         else:
@@ -81,15 +91,14 @@ class MysqlBackend:
             ",".join(len(args[0]) * ["%s"]),
             ",".join(
                 ["{c}=VALUES({c})".format(c=c) for c in arglist[0].keys()]))
-        retry(lambda: self.cursor.executemany(query, args))
+        self.retry(lambda: self.cursor.executemany(query, args))

     def insert(self, table, **kwargs):
         self.insertmany(table, [kwargs])

-    def get_most_recent_etag(self, extid, date):
+    def get_etag(self, extid, date):
         return self.get_single_value(
-            """SELECT crx_etag from extension e1 where extid=%s and date<%s and not exists """
-            """(select 1 from extension e2 where e2.extid=e1.extid and e2.date<e1.date)""",
+            """SELECT crx_etag from extension where extid=%s and date=%s""",
             (extid, date))

     def convert_date(self, date):
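For orientation, the reworked backend is bound to a single extension id and used as a context manager; a minimal usage sketch (the 32-character id and the date are placeholders, the credentials come from the option file exactly as in update_db_incremental above):

    from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend
    from ExtensionCrawler.config import const_mysql_config_file

    ext_id = "a" * 32  # hypothetical extension id
    # __enter__ opens (or reuses) the shared module-level connection,
    # retrying OperationalError up to const_mysql_maxtries() times;
    # __exit__ commits and closes the cursor.
    with MysqlBackend(ext_id, read_default_file=const_mysql_config_file()) as con:
        etag = con.get_etag(ext_id, con.convert_date("2017-01-01"))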
@@ -1,167 +0,0 @@
-#
-# Copyright (C) 2017 The University of Sheffield, UK
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-#
-
-import os
-import sqlite3
-
-
-def setup_fts_tables(con, name, columns, primary_columns):
-    sqls = [
-        s.format(
-            name=name,
-            columns=", ".join(columns),
-            new_columns=", ".join(["new." + x for x in columns]),
-            primary_columns=", ".join(primary_columns))
-        for s in [
-            """CREATE TABLE {name}({columns}, PRIMARY KEY ({primary_columns}));""",
-            """CREATE VIRTUAL TABLE {name}_fts using fts4(content="{name}", {columns});""",
-            """CREATE TRIGGER {name}_bu BEFORE UPDATE ON {name} BEGIN """
-            """DELETE FROM {name}_fts WHERE docid=old.rowid;"""
-            """END;""",
-            """CREATE TRIGGER {name}_bd BEFORE DELETE ON {name} BEGIN """
-            """DELETE FROM {name}_fts WHERE docid=old.rowid;"""
-            """END;""",
-            """CREATE TRIGGER {name}_au AFTER UPDATE ON {name} BEGIN """
-            """INSERT INTO {name}_fts(docid, {columns}) VALUES(new.rowid, {new_columns});"""
-            """END;""",
-            """CREATE TRIGGER {name}_ai AFTER INSERT ON {name} BEGIN """
-            """INSERT INTO {name}_fts(docid, {columns}) VALUES(new.rowid, {new_columns});"""
-            """END;"""
-        ]
-    ]
-    for sql in sqls:
-        con.execute(sql)
-
-
-def setup_tables(con):
-    setup_fts_tables(con, "support", [
-        "author", "commentdate", "extid", "date", "displayname", "title",
-        "language", "shortauthor", "comment"
-    ], ["author", "commentdate", "extid", "date"])
-
-    setup_fts_tables(con, "review", [
-        "author", "commentdate", "extid", "date", "displayname", "rating",
-        "language", "shortauthor", "comment"
-    ], ["author", "commentdate", "extid", "date"])
-
-    setup_fts_tables(con, "reply", [
-        "author", "commentdate", "extid", "date", "displayname", "replyto",
-        "language", "shortauthor", "comment"
-    ], ["author", "commentdate", "extid", "date"])
-
-    con.execute("""CREATE TABLE category ("""
-                """extid TEXT,"""
-                """date TEXT,"""
-                """category TEXT,"""
-                """PRIMARY KEY (extid, date, category)"""
-                """)""")
-    con.execute("""CREATE TABLE content_script_url ("""
-                """crx_etag TEXT,"""
-                """url TEXT,"""
-                """PRIMARY KEY (crx_etag, url)"""
-                """)""")
-    con.execute("""CREATE TABLE permission ("""
-                """crx_etag TEXT,"""
-                """permission TEXT,"""
-                """PRIMARY KEY (crx_etag, permission)"""
-                """)""")
-    con.execute("""CREATE TABLE crx ("""
-                """crx_etag TEXT PRIMARY KEY,"""
-                """filename TEXT,"""
-                """size INTEGER,"""
-                """publickey BLOB"""
-                """)""")
-    con.execute("""CREATE TABLE jsfile ("""
-                """crx_etag TEXT,"""
-                """detect_method TEXT,"""
-                """filename TEXT,"""
-                """type TEXT,"""
-                """lib TEXT,"""
-                """path TEXT,"""
-                """md5 TEXT,"""
-                """size INTEGER,"""
-                """version TEXT,"""
-                """PRIMARY KEY (crx_etag, path)"""
-                """)""")
-    con.execute("""CREATE TABLE status ("""
-                """extid TEXT,"""
-                """date TEXT,"""
-                """crx_status INTEGER,"""
-                """overview_status INTEGER,"""
-                """overview_exception TEXT,"""
-                """PRIMARY KEY (extid, date)"""
-                """)""")
-    con.execute("""CREATE TABLE extension ("""
-                """extid TEXT,"""
-                """date TEXT,"""
-                """name TEXT,"""
-                """version TEXT,"""
-                """description TEXT,"""
-                """downloads INTEGER,"""
-                """rating REAL,"""
-                """ratingcount INTEGER,"""
-                """fulldescription TEXT,"""
-                """developer TEXT,"""
-                """itemcategory TEXT,"""
-                """crx_etag TEXT,"""
-                """lastupdated TEXT,"""
-                """PRIMARY KEY (extid, date),"""
-                """FOREIGN KEY (crx_etag) REFERENCES crx(crx_etag)"""
-                """)""")
-
-
-class SqliteBackend:
-    def __init__(self, filename):
-        self.filename = filename
-
-    def __enter__(self):
-        new_db = False
-        if not os.path.exists(self.filename):
-            new_db = True
-        self.con = sqlite3.connect(self.filename)
-        if new_db:
-            setup_tables(self.con)
-        return self
-
-    def __exit__(self, *args):
-        self.con.commit()
-        self.con.close()
-
-    def get_single_value(self, query, args):
-        result = next(self.con.execute(query, args), None)
-        if result is not None:
-            return result[0]
-        else:
-            return None
-
-    def insert(self, table, **kwargs):
-        args = tuple(kwargs.values())
-        self.con.execute("INSERT OR REPLACE INTO {}({}) VALUES ({})".format(
-            table, ",".join(kwargs.keys()), ",".join(len(args) * ["?"])), args)
-
-    def insertmany(self, table, argslist):
-        for arg in argslist:
-            self.insert(table, **arg)
-
-    def get_most_recent_etag(self, extid, date):
-        return self.get_single_value(
-            """SELECT crx_etag from extension e1 where extid=? and date<? and not exists """
-            """(select 1 from extension e2 where e2.extid=e1.extid and e2.date<e1.date)""",
-            (extid, date))
-
-    def convert_date(self, date):
-        return date
@@ -23,7 +23,7 @@ import re
 from functools import reduce
 import requests
 import ExtensionCrawler.config
-import logging
+from ExtensionCrawler.util import log_info, log_exception


 def crawl_nearly_all_of_ext_ids():
@@ -45,24 +45,24 @@ def crawl_nearly_all_of_ext_ids():
         # The urls with a language parameter attached return a subset
         # of the ids that get returned by the plain urls, therefore we
         # skip urls with a language parameter
-        filter(is_generic_url, ([elem.text for elem in shard_elems])))
+        filter(is_generic_url, ([elem.text for elem in shard_elems])))[:1]
     shards = list(map(lambda u: requests.get(u, timeout=10).text, shard_urls))

     overview_urls = reduce(
         lambda x, y: x + y,
         map(lambda s: [elem.text for elem in get_inner_elems(s)], shards), [])
-    return [re.search("[a-z]{32}", url).group(0) for url in overview_urls]
+    return [re.search("[a-z]{32}", url).group(0) for url in overview_urls][:10]


 def get_new_ids(known_ids):
     """Discover new extension ids."""
-    logging.info("Discovering new ids ...")
+    log_info("Discovering new ids ...")
     discovered_ids = []
     try:
         discovered_ids = ExtensionCrawler.discover.crawl_nearly_all_of_ext_ids()
     except Exception:
-        logging.exception("Exception when discovering new ids")
+        log_exception("Exception when discovering new ids")
     new_ids = list(set(discovered_ids) - set(known_ids))
-    logging.info(2 * " " + "Discovered {} new extensions (out of {})".format(
-        len(new_ids), len(discovered_ids)))
+    log_info("Discovered {} new extensions (out of {})".format(
+        len(new_ids), len(discovered_ids)), 1)
     return new_ids

@@ -111,9 +111,9 @@ def init_jsinfo(zipfile, js_file):
         'evidenceText': None,
         'encoding': chardet.detect(data)['encoding'],
         'jsFilename': js_filename,
-        'md5': hashlib.md5(data).hexdigest(),
-        'sha1': hashlib.sha1(data).hexdigest(),
-        'size': file_size,
+        'md5': hashlib.md5(data).digest(),
+        'sha1': hashlib.sha1(data).digest(),
+        'size': file_size,
         'path': path
     }
     if js_info['size'] == 0:
@@ -218,7 +218,7 @@ def analyse_comment_generic_libs(zipfile, js_file, js_info, comment):
         filename = js_file.filename
     else:
         filename = js_file
-
+
     for unkregex in unknown_lib_identifiers():
         unkown_lib_matched = unkregex.finditer(comment.content)
         for match in unkown_lib_matched:
@@ -272,7 +272,7 @@ def decompose_js(file):
     if isinstance(file, str):
         js_files = [file]
     else:
-        zipfile = file
+        zipfile = file
         js_files = list(filter(lambda x: x.filename.endswith(".js"), zipfile.infolist()))

     for js_file in js_files:
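The md5/sha1 change above is easy to miss: hexdigest() returns a hex string, while digest() returns the raw bytes, which matches the binary digests that db.py now stores (category_md5, url_md5, and so on all use .digest() as well). A small illustration:

    import hashlib

    h = hashlib.md5(b"alert(1);")
    h.hexdigest()  # 32-character hex str
    h.digest()     # the same 128-bit value as 16 raw bytes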
@@ -20,6 +20,8 @@

 from time import sleep
 from random import random
+import traceback
+import logging

 def google_dos_protection(maxrange=0.3):
     """Wait a random number of seconds (between 0.5 to 0.5+maxrange)
@@ -32,3 +34,20 @@ def value_of(value, default):
         return value
     else:
         return default

+def log_debug(msg, indent_level=0, extid="-" * 32):
+    logging.debug(str(extid) + " " + 4 * indent_level * " " + str(msg))
+
+def log_info(msg, indent_level=0, extid="-" * 32):
+    logging.info(str(extid) + " " + 4 * indent_level * " " + str(msg))
+
+def log_warning(msg, indent_level=0, extid="-" * 32):
+    logging.warning(str(extid) + " " + 4 * indent_level * " " + str(msg))
+
+def log_error(msg, indent_level=0, extid="-" * 32):
+    logging.error(str(extid) + " " + 4 * indent_level * " " + str(msg))
+
+def log_exception(msg, indent_level=0, extid="-" * 32):
+    logging.error(str(extid) + " " + 4 * indent_level * " " + str(msg))
+    for line in traceback.format_exc().splitlines():
+        logging.error(str(extid) + " " + 4 * indent_level * " " + line)
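Taken together, these helpers replace the hand-padded 8 * " " prefixes used throughout the old code: every line is prefixed with the extension id (defaulting to 32 dashes) plus four spaces per indent level. A sketch of what a call site like the one in update_overview now produces:

    from ExtensionCrawler.util import log_info

    ext_id = "a" * 32  # hypothetical extension id
    log_info("* overview page: 200", 2, ext_id)
    # emits one line via logging.info: the 32-char id, a separator
    # space, 4 * 2 indent spaces, then the message:
    # aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa         * overview page: 200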
README.md
@@ -9,10 +9,13 @@ extension from the Chrome Web store.
   will check the integrity of the extension.
 * `crx-extract`: A simple tool for extracting `*.crx` files from the
   tar-based archive hierarchy.
-* `crx-jsinventory`: Build a JavaScript inventory of a `*.crx` file using a
+* `crx-jsinventory`: Build a JavaScript inventory of a `*.crx` file using a
   JavaScript decomposition analysis.
-* `create-db`: A tool for creating/initializing the database files
-  from already existing extension archives.
+* `crx-jsstrings`: A tool for extracting code blocks, comment blocks, and
+* string literals from JavaScript.
+* `create-db`: A tool for updating a remote MariaDB from already
+  existing extension archives.
+>>>>>>> bde59c50401448e346032219ab2130a31b3d7ec8

 The utilities store the extensions in the following directory
 hierarchy:
@@ -31,9 +34,12 @@ The crawler downloads the most recent extension (i.e., the `*.crx`
 file as well as the overview page. In addition, the `conf` directory
 may contain one file, called `forums.conf` that lists the ids of
 extensions for which the forums and support pages should be downloaded
-as well. The `data` directory will contain the downloaded extensions
-as well as sqlite files containing the extracted meta data. The sqlite
-files can easily be re-generated using the `create-db` tool.
+as well. The `data` directory will contain the downloaded extensions.
+
+The `crawler` and `create-db` scripts will access and update a MariaDB.
+They will use the host, datebase, and credentials found in `~/.my.cnf`.
+Since they make use of various JSON features, it is recommended to use at
+least version 10.2.8 of MariaDB.

 All utilities are written in Python 3.x. The required modules are listed
 in the file `requirements.txt`.
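Not part of the commit, but for context: a minimal `~/.my.cnf` of the shape the `crawler` and `create-db` scripts expect could look like this (host, credentials, and database name are placeholders):

    [client]
    host=localhost
    user=crawler
    password=secret
    database=extensions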
crawler
@@ -24,12 +24,12 @@ import sys
 import datetime
 import time
 import getopt
-import sqlite3
 import logging
 from functools import reduce
 from ExtensionCrawler.discover import get_new_ids
 from ExtensionCrawler.archive import get_forum_ext_ids, get_existing_ids, update_extensions
 from ExtensionCrawler.config import *
+from ExtensionCrawler.util import log_info


 def write_log(dirname, fname, text):
@@ -98,39 +98,39 @@ def log_summary(res, runtime=0):

     corrupt_tar_archives = list(filter(lambda x: x.corrupt_tar(), res))

-    logging.info("Summary:")
-    logging.info(" Updated {} out of {} extensions successfully".format(
+    log_info("Summary:")
+    log_info(" Updated {} out of {} extensions successfully".format(
         str(len(list(filter(lambda x: x.is_ok(), res)))), str(len(res))))
-    logging.info(" Updated extensions: {:8d}".format(
+    log_info(" Updated extensions: {:8d}".format(
         len(list(filter(lambda x: x.is_ok() and not x.not_modified(), res)))))
-    logging.info(" Updated SQL databases: {:8d}".format(
+    log_info(" Updated SQL databases: {:8d}".format(
         len(list(filter(lambda x: x.sql_success(), res)))))
-    logging.info(" New extensions: {:8d}".format(
+    log_info(" New extensions: {:8d}".format(
         len(list(filter(lambda x: x.is_new(), res)))))
-    logging.info(" Not authorized: {:8d}".format(
+    log_info(" Not authorized: {:8d}".format(
         len(list(filter(lambda x: x.not_authorized(), res)))))
-    logging.info(" Raised Google DDOS: {:8d}".format(
+    log_info(" Raised Google DDOS: {:8d}".format(
         len(list(filter(lambda x: x.raised_google_ddos(), res)))))
-    logging.info(" Not modified archives: {:8d}".format(
+    log_info(" Not modified archives: {:8d}".format(
         len(list(filter(lambda x: x.not_modified(), res)))))
-    logging.info(" Extensions not in store: {:8d}".format(
+    log_info(" Extensions not in store: {:8d}".format(
         len(list(filter(lambda x: x.not_in_store(), res)))))
-    logging.info(" Unknown exception: {:8d}".format(
+    log_info(" Unknown exception: {:8d}".format(
         len(list(filter(lambda x: x.has_exception(), res)))))
-    logging.info(
+    log_info(
         " Corrupt tar archives: {:8d}".format(len(corrupt_tar_archives)))
-    logging.info(" SQL exception: {:8d}".format(
+    log_info(" SQL exception: {:8d}".format(
         len(list(filter(lambda x: x.sql_exception(), res)))))
-    logging.info(" Total runtime: {}".format(
+    log_info(" Total runtime: {}".format(
         str(datetime.timedelta(seconds=int(runtime)))))

     if corrupt_tar_archives != []:
-        logging.info("")
-        logging.info("List of extensions with corrupted files/archives:")
+        log_info("")
+        log_info("List of extensions with corrupted files/archives:")
         list(
-            map(lambda x: logging.info(" " + x.id + ": " + str(x.exception), corrupt_tar_archives)
+            map(lambda x: log_info(" " + x.id + ": " + str(x.exception), corrupt_tar_archives)
             ))
-        logging.info("")
+        log_info("")


 def helpmsg():
@@ -144,14 +144,12 @@ def helpmsg():

 def print_config(basedir, archive_dir, conf_dir, discover, parallel):
     """Print current configuration."""
-    logging.info("Configuration:")
-    logging.info(" Base dir: {}".format(basedir))
-    logging.info(" Archive directory: {}".format(archive_dir))
-    logging.info(" Configuration directory: {}".format(conf_dir))
-    logging.info(" Discover new extensions: {}".format(discover))
-    logging.info(" Max num. of concurrent downloads: {}".format(parallel))
-    logging.info(" SQLite 3 version: {}".format(
-        sqlite3.sqlite_version))
+    log_info("Configuration:")
+    log_info(" Base dir: {}".format(basedir))
+    log_info(" Archive directory: {}".format(archive_dir))
+    log_info(" Configuration directory: {}".format(conf_dir))
+    log_info(" Discover new extensions: {}".format(discover))
+    log_info(" Max num. of concurrent downloads: {}".format(parallel))


 def parse_args(argv):
@@ -220,7 +218,7 @@ def main(argv):
     # they are often temporary
     has_exception = list(filter(lambda x: x.has_exception(), res))
     if has_exception != []:
-        logging.info(
+        log_info(
             " {} extensions with unknown exceptions, start another try ...".
             format(str(len(has_exception))))
         has_exception_ids = list(map(lambda x: x.id, has_exception))
create-db
@@ -24,16 +24,16 @@ import time
 import tempfile
 import fnmatch
 from multiprocessing import Pool
 from functools import partial
 import logging
 import datetime

-from ExtensionCrawler.archive import update_sqlite_incremental
+from ExtensionCrawler.archive import update_db_incremental
 from ExtensionCrawler.config import *
+from ExtensionCrawler.util import log_info, log_warning, log_error, log_exception


 def help():
-    print("create-db [OPTION] DBBASEDIR")
-    print(" DBBASEDIR directory for generated db files")
+    print("create-db [OPTION]")
     print(" -h print this help text")
     print(" -a <DIR> archive directory")
     print(" -p <PREFIX> three-letter-prefix")
@@ -43,26 +43,28 @@ def help():
     print(" -N <MAXTASKID> ")


-def process_id(dbbasedir, path):
+def process_id(path):
     start = time.time()
     with tempfile.TemporaryDirectory() as tmpdir:
         with tarfile.open(path) as t:
             t.extractall(tmpdir)

         extid = os.listdir(tmpdir)[0]
-        logging.info("Processing {}".format(extid))
-        dbpath = os.path.join(dbbasedir, extid + ".sqlite")
-        if os.path.exists(dbpath):
-            os.remove(dbpath)
+        log_info("Start processing extension", 0, extid)
         iddir = os.path.join(tmpdir, extid)

         for date in sorted(os.listdir(iddir)):
             try:
-                update_sqlite_incremental(dbpath, iddir, extid, date)
+                update_db_incremental(iddir, extid, date)
             except Exception:
-                logging.exception("Exception when handling {} on {}".
-                                  format(extid, date))
-        logging.info("Finished {} in {}s".format(extid, time.time() - start))
+                log_exception(
+                    "Exception when handling data from {}".format(date), 0,
+                    extid)
+        log_info(
+            "Finished extension in {}".format(
+                str(datetime.timedelta(seconds=int(time.time() - start)))),
+            0,
+            extid)


 def find(archive, pattern):
@@ -116,13 +118,6 @@ def parse_args(argv):
         elif opt in ("-N", "--maxtaskid"):
             maxtaskid = int(arg)

-    if len(args) < 1:
-        help()
-        sys.exit(2)
-
-    dbbasedir = args[0]
-
     paths += args[1:]
     if paths == []:
         paths = list(find(archive, "*"))

@@ -132,16 +127,16 @@ def parse_args(argv):
     else:
         paths = paths[(taskid - 1) * chunksize:taskid * chunksize]

-    return dbbasedir, paths, parallel
+    return paths, parallel


 def main(argv):
     logging.basicConfig(level=logging.INFO, format=const_log_format())

-    dbbasedir, paths, parallel = parse_args(argv)
+    paths, parallel = parse_args(argv)

     with Pool(processes=parallel) as p:
-        p.map(partial(process_id, dbbasedir), paths)
+        p.map(process_id, paths)


 if __name__ == "__main__":
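With the DBBASEDIR argument gone from create-db, an invocation now only points at the archive. Based on the option parsing above, a typical call might look like this (the archive path and prefix are placeholders):

    create-db -a /srv/Shared/BrowserExtensions/ -p aaa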
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-# Usage:
-# ./generate_small_db.sh [BASEDIR] [DBPATH] [EXTENSIONCRAWLER]
-# [BASEDIR] path to extension data, needs to contain aaa, aab, etc (defaults to sharc path)
-# [DBPATH] path to output db (defaults to ~/aa-ac.sqlite)
-# [EXTENSIONCRAWLER] path to git repo (defaults to ~/ExtensionCrawler)
-
-BASEDIR=${1:-/shared/brucker_research1/Shared/BrowserExtensions/data}
-DBPATH=${2:-~/aa-ac.sqlite}
-EXTENSIONCRAWLER=${3:-~/ExtensionCrawler}
-
-find "$BASEDIR" -mindepth 1 -maxdepth 1 -name "a[a-c]*" -exec "$EXTENSIONCRAWLER/scripts/merge_dbs" "{}" "$DBPATH" \;
@@ -1,13 +1,8 @@
 #!/bin/bash
-# m h dom mon dow command
-# 15 01 * * * (cd ~/ExtensionCrawler; ((git fetch ; git checkout production; git pull) &> /dev/null))
-# 33 01 * * * ~/ExtensionCrawler/scripts/global_update_db.sh
-# 07 02 * * * ~/ExtensionCrawler/scripts/global_update.sh
-

 export LD_LIBRARY_PATH=$HOME/local/lib:/usr/local/lib:$LD_LIBRARY_PATH
 export PATH=$HOME/local/bin:/usr/local/bin:$PATH

 ARCHIVE=${1:-/srv/Shared/BrowserExtensions/}
 CRAWLERHOME=${2:-~/ExtensionCrawler}
 LOGPREFIX=$ARCHIVE/log/`date --iso-8601=ns`

@@ -15,11 +10,8 @@ LOG=$LOGPREFIX-global.log

 date +'* Start Updating Extensions Archive (%c)' | tee $LOG

-SQLITE=`which sqlite3`
-
 # Update extensions
 (cd $CRAWLERHOME; (./crawler -d -a $ARCHIVE > $LOGPREFIX.log))
-

 date +'* Update Finished (%c)' | tee -a $LOG

@@ -1,90 +0,0 @@
-#!/bin/bash
-# m h dom mon dow command
-# 15 01 * * * (cd ~/ExtensionCrawler; ((git fetch ; git checkout production; git pull) &> /dev/null))
-# 33 01 * * * ~/ExtensionCrawler/scripts/global_update_db.sh
-# 07 02 * * * ~/ExtensionCrawler/scripts/global_update.sh
-
-
-export LD_LIBRARY_PATH=$HOME/local/lib:/usr/local/lib:$LD_LIBRARY_PATH
-export PATH=$HOME/local/bin:/usr/local/bin:$PATH
-
-ARCHIVE=${1:-/srv/Shared/BrowserExtensions/}
-CRAWLERHOME=${2:-~/ExtensionCrawler}
-LOGPREFIX=$ARCHIVE/log/`date --iso-8601=ns`
-LOG=$LOGPREFIX-global-db.log
-
-DBARCHIVE=`find $ARCHIVE/.snapshot -maxdepth 1 -mindepth 1 -name "D*" | head -n 1`
-
-SQLITE=`which sqlite3`
-
-date +"* Start Creating aa-ac.build.sqlite Data Base (%c) using $SQLITE (data: $DBARCHIVE/data)" | tee -a $LOG
-# Update small database
-rm -f $ARCHIVE/db/aa-ac.build.*
-
-(cd $CRAWLERHOME; (./scripts/generate_small_db.sh $DBARCHIVE/data $ARCHIVE/db/aa-ac.build.sqlite &> $LOGPREFIX-sqlite-aa-ac.log))
-if [ $? -ne "0" ]; then
-    echo " Creation of aa-ac.build.sqlite failed - see log file for details" | tee -a $LOG
-else
-    SIZE=`du -k $ARCHIVE/db/aa-ac.build.sqlite | cut -f1`
-    echo " Created aa-ac.build.sqlite successfully ($SIZE kb)" | tee -a $LOG
-fi
-
-if [ -f "$ARCHIVE"/db/aa-ac.build.sqlite ]; then
-    date +'* Start Compressing aa-ac.sqlite Data Base (%c)' | tee -a $LOG
-    pbzip2 -f "$ARCHIVE"/db/aa-ac.build.sqlite
-    if [ $? -ne "0" ]; then
-        echo " Creation of aa-ac.sqlite.build.bz2 failed" | tee -a $LOG
-        rm -f $ARCHIVE/db/aa-ac.build.*
-    else
-        rm -f $ARCHIVE/db/aa-ac.sqlite.bz2
-        mv $ARCHIVE/db/aa-ac.build.sqlite.bz2 $ARCHIVE/db/aa-ac.sqlite.bz2
-        SIZE=`du -k $ARCHIVE/db/aa-ac.sqlite.bz2 | cut -f1`
-        echo " Created aa-ac.sqlite.bz2 successfully ($SIZE kb)" | tee -a $LOG
-    fi
-fi
-
-date +"* Start Creating full.sqlite Data Base (%c) using $SQLITE (data: $DBARCHIVE/data)" | tee -a $LOG
-# Update full database
-rm -f $ARCHIVE/db/full.build.*
-"$CRAWLERHOME/scripts/merge_dbs" "$DBARCHIVE/data" "$ARCHIVE/db/full.build.sqlite" &> $LOGPREFIX-sqlite-full.log
-if [ $? -ne "0" ]; then
-    echo " Creation of full.build.sqlite failed - see log file for details" | tee -a $LOG
-else
-    SIZE=`du -k $ARCHIVE/db/full.build.sqlite | cut -f1`
-    echo " Created full.build.sqlite successfully ($SIZE kb)" | tee -a $LOG
-fi
-
-sync -f "$ARCHIVE"
-sync -f "$ARCHIVE"
-sync -f "$ARCHIVE"
-
-if [ -f "$ARCHIVE"/db/full.build.sqlite ]; then
-    rm -f "$ARCHIVE"/db/full.sqlite
-    cp "$ARCHIVE"/db/full.build.sqlite "$ARCHIVE"/db/full.sqlite
-    sync -f "$ARCHIVE"
-    sync -f "$ARCHIVE"
-    sync -f "$ARCHIVE"
-    date +'* Start analysis (%c)' | tee -a $LOG
-    queries=`dirname $0`
-    sqlite3 "$ARCHIVE"/db/full.sqlite < $queries/../queries/get_added_permissions.sql | tee -a $LOGPREFIX-analysis.log | /usr/bin/mail -s "Extension Analysis" root
-    date +'* Analysis finished (%c)' | tee -a $LOG
-
-    date +'* Start Compressing full.build.sqlite Data Base (%c)' | tee -a $LOG
-    pbzip2 -f "$ARCHIVE"/db/full.build.sqlite
-    if [ $? -ne "0" ]; then
-        rm -f $ARCHIVE/db/full.build.*
-        echo " Creation of full.sqlite.bz2 failed" | tee -a $LOG
-    else
-        mv $ARCHIVE/db/full.build.sqlite.bz2 $ARCHIVE/db/full.sqlite.bz2
-        SIZE=`du -k $ARCHIVE/db/full.sqlite.bz2 | cut -f1`
-        echo " Created full.sqlite.bz2 successfully ($SIZE kb)" | tee -a $LOG
-    fi
-fi
-
-# date +'* Start Compressing Log files (%c)'
-# for f in $ARCHIVE/log/*.log; do
-#     pbzip2 -f $f &> /dev/null
-# done
-
-date +'* Update Finished (%c)' | tee -a $LOG
@@ -1,100 +0,0 @@
-#!/usr/bin/env python3
-
-import sqlite3
-import sys
-import os
-import fnmatch
-
-MAX_ATTACHED_DBS = 10
-
-
-def merge_and_detach(con, currently_attached, tablenames):
-    for db in currently_attached:
-        for tablename in tablenames:
-            con.execute(
-                "INSERT OR IGNORE INTO {tablename} SELECT * from {dbname}.{tablename};".
-                format(tablename=tablename, dbname=db))
-        con.commit()
-
-    con.commit()
-    for db in currently_attached:
-        con.execute("DETACH DATABASE {}".format(db))
-
-
-def get_tablenames(dbpath):
-    with sqlite3.connect(dbpath) as con:
-        create_strings = con.execute(
-            """select name from sqlite_master where type='table'"""
-            """ and name NOT LIKE '%!_fts%' escape '!';""").fetchall()
-        return [x[0] for x in create_strings]
-
-
-def merge_schema(con, dbpath):
-    con.execute("ATTACH DATABASE ? as schemadb;", (dbpath, ))
-    create_strings = con.execute(
-        """select sql from schemadb.sqlite_master where """
-        """(type='table' AND name NOT LIKE '%!_fts!_%' escape '!') OR """
-        """type='trigger';""")
-
-    for (create_string, ) in create_strings:
-        print(create_string)
-        con.execute(create_string)
-
-    con.execute("DETACH DATABASE schemadb;")
-
-
-def find(pattern, path):
-    for root, dirs, files in os.walk(path):
-        for name in files:
-            if fnmatch.fnmatch(name, pattern):
-                yield os.path.join(root, name)
-
-
-def help():
-    print("Usage: merge_dbs DBSPATH TODB")
-    print(" DBSPATH the folder containing the *.sqlite files")
-    print(" (searched recursivly)")
-    print(" TODB the destination sqlite file")
-
-
-def main(argv):
-    if len(argv) != 2:
-        help()
-        sys.exit(1)
-    dbspath, todb = argv[:2]
-
-    print("Using sqlite3 version {}".format(sqlite3.sqlite_version))
-
-    if os.path.isdir(dbspath):
-        sqlitepaths = list(find("*.sqlite", dbspath))
-    else:
-        sqlitepaths = [dbspath]
-
-    firstdb = sqlitepaths[0]
-    tablenames = get_tablenames(firstdb)
-
-    with sqlite3.connect(todb) as con:
-        if con.execute("SELECT COUNT(*) FROM sqlite_master;").fetchone()[
-                0] == 0:
-            print("Merging schema from {}".format(firstdb))
-            merge_schema(con, firstdb)
-
-        con.execute("PRAGMA default_temp_store=2;")
-        con.execute("PRAGMA cache_size=20000;")
-        con.execute("PRAGMA temp_store=2;")
-
-        currently_attached = []
-        for i, dbpath in enumerate(sqlitepaths):
-            dbname = "db{}".format(i)
-            print("Attaching {}".format(dbpath))
-            con.execute("ATTACH DATABASE ? as ?", (dbpath, dbname))
-            currently_attached += [dbname]
-            if len(currently_attached) % MAX_ATTACHED_DBS == 0 or i + 1 == len(
-                    sqlitepaths):
-                merge_and_detach(con, currently_attached, tablenames)
-                currently_attached = []
-        con.commit()
-
-
-if __name__ == "__main__":
-    main(sys.argv[1:])
@@ -1,19 +0,0 @@
-#!/bin/bash
-#
-#$ -j yes
-#
-# Example invocation:
-# qsub merge_dbs DBDIR OUTDBPATH EXTENSIONCRAWLERPATH
-# DBDIR base dir for the generated db files
-# OUTDBPATH the generated full db
-# EXTENSIONCRAWLERPATH ExtensionCrawler git repo
-DBDIR="$1"
-OUTDBPATH="$2"
-EXTENSIONCRAWLERDIR="$3"
-
-export PATH=~/bin:$PATH
-export LD_LIBRARY_PATH=~/lib:${LD_LIBRARY_PATH:-}
-
-set -u
-
-"$EXTENSIONCRAWLERDIR/scripts/merge_dbs" "$DBDIR" "$OUTDBPATH"
@@ -1,33 +0,0 @@
-#!/bin/bash
-#
-#$ -t 1-256
-#$ -j yes
-#
-# Example invocation:
-# qsub merge_each_db DBDIR OUTDBPATH EXTENSIONCRAWLERPATH
-# DBDIR base dir for the generated db files
-# OUTDBPATH path containing the three-letter-dirs
-# EXTENSIONCRAWLERPATH ExtensionCrawler git repo
-DBDIR="$1"
-OUTDBPATH="$2"
-EXTENSIONCRAWLERDIR="$3"
-
-module -s load apps/python/conda 2> /dev/null
-source activate mypython35
-
-export PATH=~/bin:$PATH
-export LD_LIBRARY_PATH=~/lib:${LD_LIBRARY_PATH:-}
-
-function task_id_to_letter_256 {
-    ABC=abcdefghijklmnopqrstuvwxyz
-    let "I1 = (($1-1) / 16) % 16"
-    let "I2 = ($1-1) % 16"
-    echo ${ABC:$I1:1}${ABC:$I2:1}
-}
-
-set -u
-
-find "$DBDIR" -name "$(task_id_to_letter_256 $SGE_TASK_ID)*.sqlite" -print0 | while IFS= read -r -d '' file; do
-    DBNAME=$(basename "$file")
-    "$EXTENSIONCRAWLERDIR/scripts/merge_dbs" "$file" "$OUTDBPATH/${DBNAME:0:3}/$DBNAME"
-done
@@ -14,7 +14,7 @@ ssh sharc.shef.ac.uk mkdir -p $TARGETDIR/logs
 ssh sharc.shef.ac.uk mkdir -p $TARGETDIR/out

 echo "Pushing $BASEDIR to sharc.shef.ac.uk:$TARGETDIR/ExtensionCrawler ..."
-rsync -zr "$BASEDIR/" sharc.shef.ac.uk:"$TARGETDIR/ExtensionCrawler"
+rsync -zr --exclude "$BASEDIR/archive" "$BASEDIR/" sharc.shef.ac.uk:"$TARGETDIR/ExtensionCrawler"

 echo "Starting job ..."
 ssh sharc.shef.ac.uk \