Merge with upstream.

Achim D. Brucker 2017-08-30 17:18:31 +01:00
commit 3c9ccb4da9
18 changed files with 205 additions and 621 deletions

View File

@ -34,14 +34,13 @@ import datetime
import dateutil
import dateutil.parser
import requests
import logging
from ExtensionCrawler.config import (
const_review_payload, const_review_search_url, const_download_url,
get_local_archive_dir, const_overview_url, const_support_url,
const_support_payload, const_review_search_payload, const_review_url)
from ExtensionCrawler.util import google_dos_protection, value_of
from ExtensionCrawler.sqlite import db_file, update_sqlite_incremental
from ExtensionCrawler.util import google_dos_protection, value_of, log_info, log_exception
from ExtensionCrawler.db import update_db_incremental
class Error(Exception):
@ -223,11 +222,10 @@ def update_overview(tar, date, ext_id):
res = None
try:
res = requests.get(const_overview_url(ext_id), timeout=10)
logging.info(8 * " " +
"* overview page: {}".format(str(res.status_code)))
log_info("* overview page: {}".format(str(res.status_code)), 2, ext_id)
store_request_text(tar, date, 'overview.html', res)
except Exception as e:
logging.exception("Exception when retrieving overview page")
log_exception("Exception when retrieving overview page", 2, ext_id)
write_text(tar, date, 'overview.html.exception',
traceback.format_exc())
return RequestResult(res, e)
@ -266,8 +264,9 @@ def update_crx(archivedir, tmptardir, ext_id, date):
stream=True,
headers=headers,
timeout=10)
logging.info(8 * " " + "* crx archive (Last: {}): {}".format(
value_of(last_crx_http_date, "n/a"), str(res.status_code)))
log_info("* crx archive (Last: {}): {}".format(
value_of(last_crx_http_date, "n/a"), str(res.status_code)), 2,
ext_id)
extfilename = os.path.basename(res.url)
if re.search('&', extfilename):
extfilename = "default.crx"
@ -278,12 +277,12 @@ def update_crx(archivedir, tmptardir, ext_id, date):
timeout=10,
allow_redirects=True).headers.get('ETag')
write_text(tmptardir, date, extfilename + ".etag", etag)
logging.info(12 * " " +
"- checking etag, last: {}".format(last_crx_etag))
logging.info(12 * " " + " current: {}".format(etag))
log_info("- checking etag, last: {}".format(last_crx_etag), 3,
ext_id)
log_info(" current: {}".format(etag), 3, ext_id)
if (etag != "") and (etag != last_crx_etag):
logging.info(12 * " " + "- downloading due to different etags")
log_info("- downloading due to different etags", 3, ext_id)
res = requests.get(
const_download_url().format(ext_id),
@ -304,7 +303,7 @@ def update_crx(archivedir, tmptardir, ext_id, date):
write_text(tmptardir, date, extfilename + ".etag",
res.headers.get("ETag"))
except Exception as e:
logging.exception("Exception when updating crx")
log_exception("Exception when updating crx", 3, ext_id)
write_text(tmptardir, date, extfilename + ".exception",
traceback.format_exc())
return RequestResult(res, e)
@ -330,8 +329,8 @@ def update_reviews(tar, date, ext_id):
const_review_url(),
data=const_review_payload(ext_id, "0", "100"),
timeout=10)
logging.info(8 * " " +
"* review page 0-100: {}".format(str(res.status_code)))
log_info("* review page 0-100: {}".format(str(res.status_code)), 2,
ext_id)
store_request_text(tar, date, 'reviews000-099.text', res)
pages += [res.text]
@ -340,8 +339,8 @@ def update_reviews(tar, date, ext_id):
const_review_url(),
data=const_review_payload(ext_id, "100", "100"),
timeout=10)
logging.info(8 * " " + "* review page 100-200: {}".format(
str(res.status_code)))
log_info("* review page 100-200: {}".format(str(res.status_code)), 2,
ext_id)
store_request_text(tar, date, 'reviews100-199.text', res)
pages += [res.text]
@ -354,11 +353,11 @@ def update_reviews(tar, date, ext_id):
const_review_search_url(),
data=const_review_search_payload(ext_id_author_tups),
timeout=10)
logging.info(8 * " " + "* review page replies: {}".format(
str(res.status_code)))
log_info("* review page replies: {}".format(str(res.status_code)),
2, ext_id)
store_request_text(tar, date, 'reviewsreplies.text', res)
except Exception as e:
logging.exception("Exception when updating reviews")
log_exception("Exception when updating reviews", 2, ext_id)
write_text(tar, date, 'reviews.html.exception', traceback.format_exc())
return RequestResult(res, e)
return RequestResult(res)
@ -374,8 +373,8 @@ def update_support(tar, date, ext_id):
const_support_url(),
data=const_support_payload(ext_id, "0", "100"),
timeout=10)
logging.info(8 * " " +
"* support page 0-100: {}".format(str(res.status_code)))
log_info("* support page 0-100: {}".format(str(res.status_code)), 2,
ext_id)
store_request_text(tar, date, 'support000-099.text', res)
pages += [res.text]
@ -384,8 +383,8 @@ def update_support(tar, date, ext_id):
const_support_url(),
data=const_support_payload(ext_id, "100", "100"),
timeout=10)
logging.info(8 * " " +
"* support page 100-200: {}".format(str(res.status_code)))
log_info("* support page 100-200: {}".format(str(res.status_code)), 2,
ext_id)
store_request_text(tar, date, 'support100-199.text', res)
pages += [res.text]
@ -398,19 +397,19 @@ def update_support(tar, date, ext_id):
const_review_search_url(),
data=const_review_search_payload(ext_id_author_tups),
timeout=10)
logging.info(8 * " " + "* support page replies: {}".format(
str(res.status_code)))
log_info("* support page replies: {}".format(str(res.status_code)),
2, ext_id)
store_request_text(tar, date, 'supportreplies.text', res)
except Exception as e:
logging.exception("Exception when updating support pages")
log_exception("Exception when updating support pages", 2, ext_id)
write_text(tar, date, 'support.html.exception', traceback.format_exc())
return RequestResult(res, e)
return RequestResult(res)
def update_extension(archivedir, forums, ext_id):
logging.info(4 * " " + "Updating extension {}{}".format(
ext_id, " (including forums)" if forums else ""))
log_info("Updating extension {}".format(" (including forums)"
if forums else ""), 1, ext_id)
is_new = False
tar_exception = None
sql_exception = None
@ -426,12 +425,12 @@ def update_extension(archivedir, forums, ext_id):
try:
tmpdir = tempfile.mkdtemp()
tmptardir = os.path.join(tmpdir, ext_id)
logging.info(8 * " " + "* tmptardir = {}".format(tmptardir))
log_info("* tmptardir = {}".format(tmptardir), 2, ext_id)
os.makedirs(
os.path.join(archivedir, get_local_archive_dir(ext_id)),
exist_ok=True)
except Exception as e:
logging.exception(8 * " " + "* FATAL: cannot create tmpdir")
log_exception("* FATAL: cannot create tmpdir", 3, ext_id)
tar_exception = e
return UpdateResult(ext_id, is_new, tar_exception, None, None, None,
None, sql_exception, False)
@ -462,8 +461,7 @@ def update_extension(archivedir, forums, ext_id):
if os.path.exists(tar):
shutil.copyfile(tar, tardir + ".bak.tar")
except Exception as e:
logging.exception(8 * " " +
"* FATAL: cannot rename old tar archive")
log_exception("* FATAL: cannot rename old tar archive", 3, ext_id)
tar_exception = e
try:
write_text(tardir, date, ext_id + ".tar.rename.exception",
@ -478,7 +476,7 @@ def update_extension(archivedir, forums, ext_id):
ar.add(tmptardir, arcname=ext_id)
ar.close()
except Exception as e:
logging.exception(8 * " " + "* FATAL: cannot create tar archive")
log_exception("* FATAL: cannot create tar archive", 3, ext_id)
tar_exception = e
try:
write_text(tardir, date, ext_id + ".tar.create.exception",
@ -487,12 +485,10 @@ def update_extension(archivedir, forums, ext_id):
pass
try:
logging.info(8 * " " + "* Updating db...")
db_path = db_file(archivedir, ext_id)
update_sqlite_incremental(db_path, tmptardir, ext_id, date)
update_db_incremental(tmptardir, ext_id, date)
sql_success = True
except Exception as e:
logging.exception(8 * " " + "* Exception during update of sqlite db")
log_exception("* Exception during update of db", 3, ext_id)
sql_exception = e
try:
@ -503,7 +499,7 @@ def update_extension(archivedir, forums, ext_id):
try:
shutil.rmtree(path=tmpdir)
except Exception as e:
logging.exception(8 * " " + "* FATAL: cannot remove archive directory")
log_exception("* FATAL: cannot remove archive directory", 3, ext_id)
tar_exception = e
try:
write_text(tardir, date, ext_id + ".dir.remove.exception",
@ -511,8 +507,11 @@ def update_extension(archivedir, forums, ext_id):
except Exception:
pass
logging.info(8 * " " + "* Duration: {}".format(
datetime.timedelta(seconds=int(time.time() - start))))
log_info(
"* Duration: {}".format(
datetime.timedelta(seconds=int(time.time() - start))),
2,
ext_id)
return UpdateResult(ext_id, is_new, tar_exception, res_overview, res_crx,
res_reviews, res_support, sql_exception, sql_success)
@ -522,22 +521,20 @@ def update_extensions(archivedir, parallel, forums_ext_ids, ext_ids):
ext_without_forums = []
ext_ids = list(set(ext_ids) - set(forums_ext_ids))
forums_ext_ids = list(set(forums_ext_ids))
logging.info("Updating {} extensions ({} including forums)".format(
log_info("Updating {} extensions ({} including forums)".format(
len(ext_ids), len(forums_ext_ids)))
# First, update extensions with forums sequentially (and with delays) to
avoid running into Google's DDoS detection.
logging.info(2 * " " +
"Updating {} extensions including forums (sequentially))".
format(len(forums_ext_ids)))
log_info("Updating {} extensions including forums (sequentially))".format(
len(forums_ext_ids)), 1)
ext_with_forums = list(
map(partial(update_extension, archivedir, True), forums_ext_ids))
# Second, update extensions without forums parallel to increase speed.
parallel_ids = list(set(ext_ids) - set(forums_ext_ids))
logging.info(2 * " " +
"Updating {} extensions excluding forums (parallel))".format(
len(parallel_ids)))
log_info("Updating {} extensions excluding forums (parallel))".format(
len(parallel_ids)), 1)
with Pool(parallel) as p:
ext_without_forums = list(
p.map(partial(update_extension, archivedir, False), parallel_ids))

View File

@ -125,15 +125,6 @@ def archive_file(archivedir, ext_id):
str(archivedir), get_local_archive_dir(ext_id), ext_id + ".tar")
def db_file(archivedir, ext_id):
"""DB (sqlite) file of an extension."""
return os.path.join(archivedir,
get_local_archive_dir(ext_id), ext_id + ".sqlite")
def jsloc_timeout():
"""Maximum number of seconds for counting jsloc per extension."""
return 600
def const_basedir():
"""Top-level directory for the extension crawler archive."""
return "archive"
@ -153,14 +144,11 @@ def const_discover():
"""Default configuration of discovery mode"""
return False
def const_use_mysql():
return False
def const_mysql_config_file():
return "~/.my.cnf"
def const_mysql_maxtries():
return 3
return 30
def const_mysql_try_wait():
return 2
return 10

View File

@ -21,7 +21,6 @@ from ExtensionCrawler.crx import *
from ExtensionCrawler.archive import *
from ExtensionCrawler.js_decomposer import decompose_js, DetectionType, FileClassification
from ExtensionCrawler.dbbackend.sqlite_backend import SqliteBackend
from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend
import re
@ -31,7 +30,7 @@ import json
import os
import glob
import datetime
import logging
import hashlib
def get_etag(ext_id, datepath, con):
@ -53,8 +52,8 @@ def get_etag(ext_id, datepath, con):
if "ETag" in headers:
return headers["ETag"]
except Exception:
logging.warning(16 * " " +
"* WARNING: could not parse crx header file")
log_warning("* WARNING: could not parse crx header file", 3,
ext_id)
# Trying to look up previous etag in database
linkpath = next(
@ -64,8 +63,7 @@ def get_etag(ext_id, datepath, con):
link = f.read()
linked_date = link[3:].split("/")[0]
result = con.get_most_recent_etag(ext_id,
con.convert_date(linked_date))
result = con.get_etag(ext_id, con.convert_date(linked_date))
if result is not None:
return result
@ -100,7 +98,7 @@ def get_crx_status(datepath):
def parse_and_insert_overview(ext_id, date, datepath, con):
logging.info(16 * " " + "- parsing overview file")
log_debug("- parsing overview file", 3, ext_id)
overview_path = os.path.join(datepath, "overview.html")
if os.path.exists(overview_path):
with open(overview_path) as overview_file:
@ -187,13 +185,14 @@ def parse_and_insert_overview(ext_id, date, datepath, con):
"category",
extid=ext_id,
date=con.convert_date(date),
category_md5=hashlib.md5(category.encode()).digest(),
category=category)
def parse_and_insert_crx(ext_id, date, datepath, con):
logging.info(16 * " " + "- parsing crx file")
crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
if crx_path:
log_debug("- parsing crx file", 3, ext_id)
filename = os.path.basename(crx_path)
with ZipFile(crx_path) as f:
@ -235,6 +234,8 @@ def parse_and_insert_crx(ext_id, date, datepath, con):
con.insert(
"permission",
crx_etag=etag,
permission_md5=hashlib.md5(
str(permission).encode()).digest(),
permission=str(permission))
if "content_scripts" in manifest:
for csd in manifest["content_scripts"]:
@ -243,21 +244,26 @@ def parse_and_insert_crx(ext_id, date, datepath, con):
con.insert(
"content_script_url",
crx_etag=etag,
url_md5=hashlib.md5(
str(urlpattern).encode()).digest(),
url=str(urlpattern))
js_files = decompose_js(f)
for js_file_info in js_files:
# TODO: Add: evidenceStartPos, evidenceEndPos, and EvidenceText, sha1
# TODO: md5, sha1, size, path, type, detect_method, crx_etag, filename should be non-null
con.insert(
"jsfile",
crx_etag=etag,
detect_method=(js_file_info['detectionMethod']).value,
evidence_start_pos=str(js_file_info['evidenceStartPos']),
evidence_end_pos=str(js_file_info['evidenceEndPos']),
evidence_text=str(js_file_info['evidenceText']),
filename=js_file_info['jsFilename'],
type=(js_file_info['type']).value,
lib=js_file_info['lib'],
path=js_file_info['path'],
encoding=js_file_info['encoding'],
md5=js_file_info['md5'],
sha1=js_file_info['sha1'],
size=js_file_info['size'],
version=js_file_info['version'])
@ -268,7 +274,7 @@ def get(d, k):
def parse_and_insert_review(ext_id, date, reviewpath, con):
logging.info(16 * " " + "- parsing review file")
log_debug("- parsing review file", 3, ext_id)
with open(reviewpath) as f:
content = f.read()
stripped = content[content.find('{"'):]
@ -304,7 +310,7 @@ def parse_and_insert_review(ext_id, date, reviewpath, con):
def parse_and_insert_support(ext_id, date, supportpath, con):
logging.info(16 * " " + "- parsing support file")
log_debug("- parsing support file", 3, ext_id)
with open(supportpath) as f:
content = f.read()
stripped = content[content.find('{"'):]
@ -340,12 +346,13 @@ def parse_and_insert_support(ext_id, date, supportpath, con):
def parse_and_insert_replies(ext_id, date, repliespath, con):
logging.info(16 * " " + "- parsing reply file")
log_debug("- parsing reply file", 3, ext_id)
with open(repliespath) as f:
d = json.load(f)
if not "searchResults" in d:
logging.warning("* WARNING: there are no search results in {}".
format(repliespath))
log_warning("* WARNING: there are no search results in {}".format(
repliespath), 3, ext_id)
return
results = []
for result in d["searchResults"]:
if "annotations" not in result:
@ -380,7 +387,7 @@ def parse_and_insert_replies(ext_id, date, repliespath, con):
def parse_and_insert_status(ext_id, date, datepath, con):
logging.info(16 * " " + "- parsing status file")
log_debug("- parsing status file", 3, ext_id)
overview_status = get_overview_status(datepath)
crx_status = get_crx_status(datepath)
@ -399,31 +406,26 @@ def parse_and_insert_status(ext_id, date, datepath, con):
overview_exception=overview_exception)
def update_sqlite_incremental(db_path, tmptardir, ext_id, date):
logging.info(12 * " " + "- parsing data from {}".format(date))
def update_db_incremental(tmptardir, ext_id, date):
log_info("* Updating db with data from from {}".format(date), 2, ext_id)
datepath = os.path.join(tmptardir, date)
if const_use_mysql():
# Don't forget to create a ~/.my.cnf file with the credentials
backend = MysqlBackend(read_default_file=const_mysql_config_file())
else:
backend = SqliteBackend(db_path)
with backend as con:
# Don't forget to create a ~/.my.cnf file with the credentials
with MysqlBackend(
ext_id, read_default_file=const_mysql_config_file()) as con:
etag = get_etag(ext_id, datepath, con)
if etag:
try:
parse_and_insert_crx(ext_id, date, datepath, con)
except zipfile.BadZipfile as e:
logging.warning(
16 * " " +
log_warning(
"* WARNING: the found crx file is not a zip file, exception: {}".
format(str(e)))
format(str(e)), 3, ext_id)
else:
crx_status = get_crx_status(datepath)
if crx_status != 401 and crx_status != 204 and crx_status != 404:
logging.warning(16 * " " + "* WARNING: could not find etag")
log_warning("* WARNING: could not find etag", 3, ext_id)
parse_and_insert_overview(ext_id, date, datepath, con)
parse_and_insert_status(ext_id, date, datepath, con)
@ -433,24 +435,24 @@ def update_sqlite_incremental(db_path, tmptardir, ext_id, date):
try:
parse_and_insert_review(ext_id, date, reviewpath, con)
except json.decoder.JSONDecodeError as e:
logging.warning(16 * " " +
"* Could not parse review file, exception: {}".
format(str(e)))
log_warning(
"* Could not parse review file, exception: {}".format(
str(e)), 3, ext_id)
supportpaths = glob.glob(os.path.join(datepath, "support*-*.text"))
for supportpath in supportpaths:
try:
parse_and_insert_support(ext_id, date, supportpath, con)
except json.decoder.JSONDecodeError as e:
logging.warning(
16 * " " + "* Could not parse support file, exception: {}".
format(str(e)))
log_warning(
"* Could not parse support file, exception: {}".format(
str(e)), 3, ext_id)
repliespaths = glob.glob(os.path.join(datepath, "*replies.text"))
for repliespath in repliespaths:
try:
parse_and_insert_replies(ext_id, date, repliespath, con)
except json.decoder.JSONDecodeError as e:
logging.warning(16 * " " +
"* Could not parse reply file, exception: {}".
format(str(e)))
log_warning(
"* Could not parse reply file, exception: {}".format(
str(e)), 3, ext_id)

View File

@ -20,6 +20,7 @@ import MySQLdb
import _mysql_exceptions
import atexit
import time
from ExtensionCrawler.util import log_info, log_error, log_exception
db = None
@ -31,38 +32,47 @@ def close_db():
atexit.register(close_db)
def retry(f):
for t in range(const_mysql_maxtries()):
try:
return f()
except _mysql_exceptions.OperationalError as e:
last_exception = e
if t + 1 == const_mysql_maxtries():
raise last_exception
else:
time.sleep(const_mysql_try_wait())
class MysqlBackend:
def __init__(self, **kwargs):
def retry(self, f):
for t in range(const_mysql_maxtries()):
try:
return f()
except _mysql_exceptions.OperationalError as e:
last_exception = e
if t + 1 == const_mysql_maxtries():
log_exception("MySQL connection eventually failed!", 3,
self.ext_id)
raise last_exception
else:
log_exception(
"""Exception on mysql connection attempt {} of {}, """
"""wating {}s before retrying...""".format(
t + 1,
const_mysql_maxtries(),
const_mysql_try_wait()), 3, self.ext_id)
time.sleep(const_mysql_try_wait())
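
For orientation, a back-of-the-envelope sketch (not project code) of the retry budget implied by the loop above, using the new values of `const_mysql_maxtries()` and `const_mysql_try_wait()` from `config.py`:

```python
# Worst case: every attempt raises OperationalError. The loop sleeps
# const_mysql_try_wait() seconds after every failed attempt except the
# last one, then re-raises the exception.
max_tries = 30   # const_mysql_maxtries() after this commit (was 3)
try_wait = 10    # const_mysql_try_wait() after this commit (was 2)

worst_case_sleep = (max_tries - 1) * try_wait
print(worst_case_sleep)  # 290 seconds, i.e. roughly five minutes of waiting
```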
def __init__(self, ext_id, **kwargs):
self.ext_id = ext_id
self.dbargs = kwargs
def __enter__(self):
global db
if db is None:
db = retry(lambda: MySQLdb.connect(**self.dbargs))
self.cursor = retry(lambda: db.cursor())
db = self.retry(lambda: MySQLdb.connect(**self.dbargs))
self.cursor = self.retry(lambda: db.cursor())
return self
def __exit__(self, *args):
retry(lambda: db.commit())
retry(lambda: self.cursor.close())
self.retry(lambda: db.commit())
self.retry(lambda: self.cursor.close())
def get_single_value(self, query, args):
retry(lambda: self.cursor.execute(query, args))
self.retry(lambda: self.cursor.execute(query, args))
result = retry(lambda: self.cursor.fetchone())
result = self.retry(lambda: self.cursor.fetchone())
if result is not None:
return result[0]
else:
@ -81,15 +91,14 @@ class MysqlBackend:
",".join(len(args[0]) * ["%s"]),
",".join(
["{c}=VALUES({c})".format(c=c) for c in arglist[0].keys()]))
retry(lambda: self.cursor.executemany(query, args))
self.retry(lambda: self.cursor.executemany(query, args))
def insert(self, table, **kwargs):
self.insertmany(table, [kwargs])
def get_most_recent_etag(self, extid, date):
def get_etag(self, extid, date):
return self.get_single_value(
"""SELECT crx_etag from extension e1 where extid=%s and date<%s and not exists """
"""(select 1 from extension e2 where e2.extid=e1.extid and e2.date<e1.date)""",
"""SELECT crx_etag from extension where extid=%s and date=%s""",
(extid, date))
def convert_date(self, date):

View File

@ -1,167 +0,0 @@
#
# Copyright (C) 2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
import sqlite3
def setup_fts_tables(con, name, columns, primary_columns):
sqls = [
s.format(
name=name,
columns=", ".join(columns),
new_columns=", ".join(["new." + x for x in columns]),
primary_columns=", ".join(primary_columns))
for s in [
"""CREATE TABLE {name}({columns}, PRIMARY KEY ({primary_columns}));""",
"""CREATE VIRTUAL TABLE {name}_fts using fts4(content="{name}", {columns});""",
"""CREATE TRIGGER {name}_bu BEFORE UPDATE ON {name} BEGIN """
"""DELETE FROM {name}_fts WHERE docid=old.rowid;"""
"""END;""",
"""CREATE TRIGGER {name}_bd BEFORE DELETE ON {name} BEGIN """
"""DELETE FROM {name}_fts WHERE docid=old.rowid;"""
"""END;""",
"""CREATE TRIGGER {name}_au AFTER UPDATE ON {name} BEGIN """
"""INSERT INTO {name}_fts(docid, {columns}) VALUES(new.rowid, {new_columns});"""
"""END;""",
"""CREATE TRIGGER {name}_ai AFTER INSERT ON {name} BEGIN """
"""INSERT INTO {name}_fts(docid, {columns}) VALUES(new.rowid, {new_columns});"""
"""END;"""
]
]
for sql in sqls:
con.execute(sql)
def setup_tables(con):
setup_fts_tables(con, "support", [
"author", "commentdate", "extid", "date", "displayname", "title",
"language", "shortauthor", "comment"
], ["author", "commentdate", "extid", "date"])
setup_fts_tables(con, "review", [
"author", "commentdate", "extid", "date", "displayname", "rating",
"language", "shortauthor", "comment"
], ["author", "commentdate", "extid", "date"])
setup_fts_tables(con, "reply", [
"author", "commentdate", "extid", "date", "displayname", "replyto",
"language", "shortauthor", "comment"
], ["author", "commentdate", "extid", "date"])
con.execute("""CREATE TABLE category ("""
"""extid TEXT,"""
"""date TEXT,"""
"""category TEXT,"""
"""PRIMARY KEY (extid, date, category)"""
""")""")
con.execute("""CREATE TABLE content_script_url ("""
"""crx_etag TEXT,"""
"""url TEXT,"""
"""PRIMARY KEY (crx_etag, url)"""
""")""")
con.execute("""CREATE TABLE permission ("""
"""crx_etag TEXT,"""
"""permission TEXT,"""
"""PRIMARY KEY (crx_etag, permission)"""
""")""")
con.execute("""CREATE TABLE crx ("""
"""crx_etag TEXT PRIMARY KEY,"""
"""filename TEXT,"""
"""size INTEGER,"""
"""publickey BLOB"""
""")""")
con.execute("""CREATE TABLE jsfile ("""
"""crx_etag TEXT,"""
"""detect_method TEXT,"""
"""filename TEXT,"""
"""type TEXT,"""
"""lib TEXT,"""
"""path TEXT,"""
"""md5 TEXT,"""
"""size INTEGER,"""
"""version TEXT,"""
"""PRIMARY KEY (crx_etag, path)"""
""")""")
con.execute("""CREATE TABLE status ("""
"""extid TEXT,"""
"""date TEXT,"""
"""crx_status INTEGER,"""
"""overview_status INTEGER,"""
"""overview_exception TEXT,"""
"""PRIMARY KEY (extid, date)"""
""")""")
con.execute("""CREATE TABLE extension ("""
"""extid TEXT,"""
"""date TEXT,"""
"""name TEXT,"""
"""version TEXT,"""
"""description TEXT,"""
"""downloads INTEGER,"""
"""rating REAL,"""
"""ratingcount INTEGER,"""
"""fulldescription TEXT,"""
"""developer TEXT,"""
"""itemcategory TEXT,"""
"""crx_etag TEXT,"""
"""lastupdated TEXT,"""
"""PRIMARY KEY (extid, date),"""
"""FOREIGN KEY (crx_etag) REFERENCES crx(crx_etag)"""
""")""")
class SqliteBackend:
def __init__(self, filename):
self.filename = filename
def __enter__(self):
new_db = False
if not os.path.exists(self.filename):
new_db = True
self.con = sqlite3.connect(self.filename)
if new_db:
setup_tables(self.con)
return self
def __exit__(self, *args):
self.con.commit()
self.con.close()
def get_single_value(self, query, args):
result = next(self.con.execute(query, args), None)
if result is not None:
return result[0]
else:
return None
def insert(self, table, **kwargs):
args = tuple(kwargs.values())
self.con.execute("INSERT OR REPLACE INTO {}({}) VALUES ({})".format(
table, ",".join(kwargs.keys()), ",".join(len(args) * ["?"])), args)
def insertmany(self, table, argslist):
for arg in argslist:
self.insert(table, **arg)
def get_most_recent_etag(self, extid, date):
return self.get_single_value(
"""SELECT crx_etag from extension e1 where extid=? and date<? and not exists """
"""(select 1 from extension e2 where e2.extid=e1.extid and e2.date<e1.date)""",
(extid, date))
def convert_date(self, date):
return date

View File

@ -23,7 +23,7 @@ import re
from functools import reduce
import requests
import ExtensionCrawler.config
import logging
from ExtensionCrawler.util import log_info, log_exception
def crawl_nearly_all_of_ext_ids():
@ -45,24 +45,24 @@ def crawl_nearly_all_of_ext_ids():
# The urls with a language parameter attached return a subset
# of the ids that get returned by the plain urls, therefore we
# skip urls with a language parameter
filter(is_generic_url, ([elem.text for elem in shard_elems])))
filter(is_generic_url, ([elem.text for elem in shard_elems])))[:1]
shards = list(map(lambda u: requests.get(u, timeout=10).text, shard_urls))
overview_urls = reduce(
lambda x, y: x + y,
map(lambda s: [elem.text for elem in get_inner_elems(s)], shards), [])
return [re.search("[a-z]{32}", url).group(0) for url in overview_urls]
return [re.search("[a-z]{32}", url).group(0) for url in overview_urls][:10]
def get_new_ids(known_ids):
"""Discover new extension ids."""
logging.info("Discovering new ids ...")
log_info("Discovering new ids ...")
discovered_ids = []
try:
discovered_ids = ExtensionCrawler.discover.crawl_nearly_all_of_ext_ids()
except Exception:
logging.exception("Exception when discovering new ids")
log_exception("Exception when discovering new ids")
new_ids = list(set(discovered_ids) - set(known_ids))
logging.info(2 * " " + "Discovered {} new extensions (out of {})".format(
len(new_ids), len(discovered_ids)))
log_info("Discovered {} new extensions (out of {})".format(
len(new_ids), len(discovered_ids)), 1)
return new_ids

View File

@ -111,9 +111,9 @@ def init_jsinfo(zipfile, js_file):
'evidenceText': None,
'encoding': chardet.detect(data)['encoding'],
'jsFilename': js_filename,
'md5': hashlib.md5(data).hexdigest(),
'sha1': hashlib.sha1(data).hexdigest(),
'size': file_size,
'md5': hashlib.md5(data).digest(),
'sha1': hashlib.sha1(data).digest(),
'size': file_size,
'path': path
}
if js_info['size'] == 0:
@ -218,7 +218,7 @@ def analyse_comment_generic_libs(zipfile, js_file, js_info, comment):
filename = js_file.filename
else:
filename = js_file
for unkregex in unknown_lib_identifiers():
unkown_lib_matched = unkregex.finditer(comment.content)
for match in unkown_lib_matched:
@ -272,7 +272,7 @@ def decompose_js(file):
if isinstance(file, str):
js_files = [file]
else:
zipfile = file
zipfile = file
js_files = list(filter(lambda x: x.filename.endswith(".js"), zipfile.infolist()))
for js_file in js_files:

View File

@ -20,6 +20,8 @@
from time import sleep
from random import random
import traceback
import logging
def google_dos_protection(maxrange=0.3):
"""Wait a random number of seconds (between 0.5 to 0.5+maxrange)
@ -32,3 +34,20 @@ def value_of(value, default):
return value
else:
return default
def log_debug(msg, indent_level=0, extid="-" * 32):
logging.debug(str(extid) + " " + 4 * indent_level * " " + str(msg))
def log_info(msg, indent_level=0, extid="-" * 32):
logging.info(str(extid) + " " + 4 * indent_level * " " + str(msg))
def log_warning(msg, indent_level=0, extid="-" * 32):
logging.warning(str(extid) + " " + 4 * indent_level * " " + str(msg))
def log_error(msg, indent_level=0, extid="-" * 32):
logging.error(str(extid) + " " + 4 * indent_level * " " + str(msg))
def log_exception(msg, indent_level=0, extid="-" * 32):
logging.error(str(extid) + " " + 4 * indent_level * " " + str(msg))
for line in traceback.format_exc().splitlines():
logging.error(str(extid) + " " + 4 * indent_level * " " + line)
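
A minimal usage sketch of the new helpers (the project's actual log format comes from `const_log_format()`, which is not shown in this diff, so the `basicConfig` call below is an assumption):

```python
import logging
from ExtensionCrawler.util import log_info

logging.basicConfig(level=logging.INFO, format="%(message)s")  # assumed format

ext_id = "a" * 32  # placeholder extension id
log_info("Updating extension", 1, ext_id)
log_info("* overview page: 200", 2, ext_id)

# Each message starts with the extension id followed by four spaces per
# indent level, so a combined log can be filtered per extension with grep:
#   aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa     Updating extension
#   aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa         * overview page: 200
```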

View File

@ -9,10 +9,13 @@ extension from the Chrome Web store.
will check the integrity of the extension.
* `crx-extract`: A simple tool for extracting `*.crx` files from the
tar-based archive hierarchy.
* `crx-jsinventory`: Build a JavaScript inventory of a `*.crx` file using a
JavaScript decomposition analysis.
* `create-db`: A tool for creating/initializing the database files
from already existing extension archives.
* `crx-jsstrings`: A tool for extracting code blocks, comment blocks, and
string literals from JavaScript.
* `create-db`: A tool for updating a remote MariaDB from already
existing extension archives.
The utilities store the extensions in the following directory
hierarchy:
@ -31,9 +34,12 @@ The crawler downloads the most recent extension (i.e., the `*.crx`
file as well as the overview page. In addition, the `conf` directory
may contain one file, called `forums.conf` that lists the ids of
extensions for which the forums and support pages should be downloaded
as well. The `data` directory will contain the downloaded extensions
as well as sqlite files containing the extracted meta data. The sqlite
files can easily be re-generated using the `create-db` tool.
as well. The `data` directory will contain the downloaded extensions.
The `crawler` and `create-db` scripts will access and update a MariaDB.
They will use the host, database, and credentials found in `~/.my.cnf`.
Since they make use of various JSON features, it is recommended to use at
least version 10.2.8 of MariaDB.
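
As a sketch of what this looks like in practice, a `~/.my.cnf` along the following lines is picked up via MySQLdb's `read_default_file`, which is how `MysqlBackend` opens its connection (host, user, password, and database names are placeholders, not values from this repository):

```python
# Hypothetical ~/.my.cnf contents:
#
#   [client]
#   host     = db.example.org
#   user     = crawler
#   password = secret
#   database = extensions

import os
import MySQLdb

con = MySQLdb.connect(read_default_file=os.path.expanduser("~/.my.cnf"))
try:
    cur = con.cursor()
    cur.execute("SELECT VERSION()")
    print(cur.fetchone()[0])  # should report MariaDB 10.2.8 or later
finally:
    con.close()
```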
All utilities are written in Python 3.x. The required modules are listed
in the file `requirements.txt`.

crawler (52 changed lines)
View File

@ -24,12 +24,12 @@ import sys
import datetime
import time
import getopt
import sqlite3
import logging
from functools import reduce
from ExtensionCrawler.discover import get_new_ids
from ExtensionCrawler.archive import get_forum_ext_ids, get_existing_ids, update_extensions
from ExtensionCrawler.config import *
from ExtensionCrawler.util import log_info
def write_log(dirname, fname, text):
@ -98,39 +98,39 @@ def log_summary(res, runtime=0):
corrupt_tar_archives = list(filter(lambda x: x.corrupt_tar(), res))
logging.info("Summary:")
logging.info(" Updated {} out of {} extensions successfully".format(
log_info("Summary:")
log_info(" Updated {} out of {} extensions successfully".format(
str(len(list(filter(lambda x: x.is_ok(), res)))), str(len(res))))
logging.info(" Updated extensions: {:8d}".format(
log_info(" Updated extensions: {:8d}".format(
len(list(filter(lambda x: x.is_ok() and not x.not_modified(), res)))))
logging.info(" Updated SQL databases: {:8d}".format(
log_info(" Updated SQL databases: {:8d}".format(
len(list(filter(lambda x: x.sql_success(), res)))))
logging.info(" New extensions: {:8d}".format(
log_info(" New extensions: {:8d}".format(
len(list(filter(lambda x: x.is_new(), res)))))
logging.info(" Not authorized: {:8d}".format(
log_info(" Not authorized: {:8d}".format(
len(list(filter(lambda x: x.not_authorized(), res)))))
logging.info(" Raised Google DDOS: {:8d}".format(
log_info(" Raised Google DDOS: {:8d}".format(
len(list(filter(lambda x: x.raised_google_ddos(), res)))))
logging.info(" Not modified archives: {:8d}".format(
log_info(" Not modified archives: {:8d}".format(
len(list(filter(lambda x: x.not_modified(), res)))))
logging.info(" Extensions not in store: {:8d}".format(
log_info(" Extensions not in store: {:8d}".format(
len(list(filter(lambda x: x.not_in_store(), res)))))
logging.info(" Unknown exception: {:8d}".format(
log_info(" Unknown exception: {:8d}".format(
len(list(filter(lambda x: x.has_exception(), res)))))
logging.info(
log_info(
" Corrupt tar archives: {:8d}".format(len(corrupt_tar_archives)))
logging.info(" SQL exception: {:8d}".format(
log_info(" SQL exception: {:8d}".format(
len(list(filter(lambda x: x.sql_exception(), res)))))
logging.info(" Total runtime: {}".format(
log_info(" Total runtime: {}".format(
str(datetime.timedelta(seconds=int(runtime)))))
if corrupt_tar_archives != []:
logging.info("")
logging.info("List of extensions with corrupted files/archives:")
log_info("")
log_info("List of extensions with corrupted files/archives:")
list(
map(lambda x: logging.info(" " + x.id + ": " + str(x.exception), corrupt_tar_archives)
map(lambda x: log_info(" " + x.id + ": " + str(x.exception)), corrupt_tar_archives
))
logging.info("")
log_info("")
def helpmsg():
@ -144,14 +144,12 @@ def helpmsg():
def print_config(basedir, archive_dir, conf_dir, discover, parallel):
"""Print current configuration."""
logging.info("Configuration:")
logging.info(" Base dir: {}".format(basedir))
logging.info(" Archive directory: {}".format(archive_dir))
logging.info(" Configuration directory: {}".format(conf_dir))
logging.info(" Discover new extensions: {}".format(discover))
logging.info(" Max num. of concurrent downloads: {}".format(parallel))
logging.info(" SQLite 3 version: {}".format(
sqlite3.sqlite_version))
log_info("Configuration:")
log_info(" Base dir: {}".format(basedir))
log_info(" Archive directory: {}".format(archive_dir))
log_info(" Configuration directory: {}".format(conf_dir))
log_info(" Discover new extensions: {}".format(discover))
log_info(" Max num. of concurrent downloads: {}".format(parallel))
def parse_args(argv):
@ -220,7 +218,7 @@ def main(argv):
# they are often temporary
has_exception = list(filter(lambda x: x.has_exception(), res))
if has_exception != []:
logging.info(
log_info(
" {} extensions with unknown exceptions, start another try ...".
format(str(len(has_exception))))
has_exception_ids = list(map(lambda x: x.id, has_exception))

View File

@ -24,16 +24,16 @@ import time
import tempfile
import fnmatch
from multiprocessing import Pool
from functools import partial
import logging
import datetime
from ExtensionCrawler.archive import update_sqlite_incremental
from ExtensionCrawler.archive import update_db_incremental
from ExtensionCrawler.config import *
from ExtensionCrawler.util import log_info, log_warning, log_error, log_exception
def help():
print("create-db [OPTION] DBBASEDIR")
print(" DBBASEDIR directory for generated db files")
print("create-db [OPTION]")
print(" -h print this help text")
print(" -a <DIR> archive directory")
print(" -p <PREFIX> three-letter-prefix")
@ -43,26 +43,28 @@ def help():
print(" -N <MAXTASKID> ")
def process_id(dbbasedir, path):
def process_id(path):
start = time.time()
with tempfile.TemporaryDirectory() as tmpdir:
with tarfile.open(path) as t:
t.extractall(tmpdir)
extid = os.listdir(tmpdir)[0]
logging.info("Processing {}".format(extid))
dbpath = os.path.join(dbbasedir, extid + ".sqlite")
if os.path.exists(dbpath):
os.remove(dbpath)
log_info("Start processing extension", 0, extid)
iddir = os.path.join(tmpdir, extid)
for date in sorted(os.listdir(iddir)):
try:
update_sqlite_incremental(dbpath, iddir, extid, date)
update_db_incremental(iddir, extid, date)
except Exception:
logging.exception("Exception when handling {} on {}".
format(extid, date))
logging.info("Finished {} in {}s".format(extid, time.time() - start))
log_exception(
"Exception when handling data from {}".format(date), 0,
extid)
log_info(
"Finished extension in {}".format(
str(datetime.timedelta(seconds=int(time.time() - start)))),
0,
extid)
def find(archive, pattern):
@ -116,13 +118,6 @@ def parse_args(argv):
elif opt in ("-N", "--maxtaskid"):
maxtaskid = int(arg)
if len(args) < 1:
help()
sys.exit(2)
dbbasedir = args[0]
paths += args[1:]
if paths == []:
paths = list(find(archive, "*"))
@ -132,16 +127,16 @@ def parse_args(argv):
else:
paths = paths[(taskid - 1) * chunksize:taskid * chunksize]
return dbbasedir, paths, parallel
return paths, parallel
def main(argv):
logging.basicConfig(level=logging.INFO, format=const_log_format())
dbbasedir, paths, parallel = parse_args(argv)
paths, parallel = parse_args(argv)
with Pool(processes=parallel) as p:
p.map(partial(process_id, dbbasedir), paths)
p.map(process_id, paths)
if __name__ == "__main__":

View File

@ -1,13 +0,0 @@
#!/bin/bash
# Usage:
# ./generate_small_db.sh [BASEDIR] [DBPATH] [EXTENSIONCRAWLER]
# [BASEDIR] path to extension data, needs to contain aaa, aab, etc (defaults to sharc path)
# [DBPATH] path to output db (defaults to ~/aa-ac.sqlite)
# [EXTENSIONCRAWLER] path to git repo (defaults to ~/ExtensionCrawler)
BASEDIR=${1:-/shared/brucker_research1/Shared/BrowserExtensions/data}
DBPATH=${2:-~/aa-ac.sqlite}
EXTENSIONCRAWLER=${3:-~/ExtensionCrawler}
find "$BASEDIR" -mindepth 1 -maxdepth 1 -name "a[a-c]*" -exec "$EXTENSIONCRAWLER/scripts/merge_dbs" "{}" "$DBPATH" \;

View File

@ -1,13 +1,8 @@
#!/bin/bash
# m h dom mon dow command
# 15 01 * * * (cd ~/ExtensionCrawler; ((git fetch ; git checkout production; git pull) &> /dev/null))
# 33 01 * * * ~/ExtensionCrawler/scripts/global_update_db.sh
# 07 02 * * * ~/ExtensionCrawler/scripts/global_update.sh
export LD_LIBRARY_PATH=$HOME/local/lib:/usr/local/lib:$LD_LIBRARY_PATH
export PATH=$HOME/local/bin:/usr/local/bin:$PATH
ARCHIVE=${1:-/srv/Shared/BrowserExtensions/}
CRAWLERHOME=${2:-~/ExtensionCrawler}
LOGPREFIX=$ARCHIVE/log/`date --iso-8601=ns`
@ -15,11 +10,8 @@ LOG=$LOGPREFIX-global.log
date +'* Start Updating Extensions Archive (%c)' | tee $LOG
SQLITE=`which sqlite3`
# Update extensions
(cd $CRAWLERHOME; (./crawler -d -a $ARCHIVE > $LOGPREFIX.log))
date +'* Update Finished (%c)' | tee -a $LOG

View File

@ -1,90 +0,0 @@
#!/bin/bash
# m h dom mon dow command
# 15 01 * * * (cd ~/ExtensionCrawler; ((git fetch ; git checkout production; git pull) &> /dev/null))
# 33 01 * * * ~/ExtensionCrawler/scripts/global_update_db.sh
# 07 02 * * * ~/ExtensionCrawler/scripts/global_update.sh
export LD_LIBRARY_PATH=$HOME/local/lib:/usr/local/lib:$LD_LIBRARY_PATH
export PATH=$HOME/local/bin:/usr/local/bin:$PATH
ARCHIVE=${1:-/srv/Shared/BrowserExtensions/}
CRAWLERHOME=${2:-~/ExtensionCrawler}
LOGPREFIX=$ARCHIVE/log/`date --iso-8601=ns`
LOG=$LOGPREFIX-global-db.log
DBARCHIVE=`find $ARCHIVE/.snapshot -maxdepth 1 -mindepth 1 -name "D*" | head -n 1`
SQLITE=`which sqlite3`
date +"* Start Creating aa-ac.build.sqlite Data Base (%c) using $SQLITE (data: $DBARCHIVE/data)" | tee -a $LOG
# Update small database
rm -f $ARCHIVE/db/aa-ac.build.*
(cd $CRAWLERHOME; (./scripts/generate_small_db.sh $DBARCHIVE/data $ARCHIVE/db/aa-ac.build.sqlite &> $LOGPREFIX-sqlite-aa-ac.log))
if [ $? -ne "0" ]; then
echo " Creation of aa-ac.build.sqlite failed - see log file for details" | tee -a $LOG
else
SIZE=`du -k $ARCHIVE/db/aa-ac.build.sqlite | cut -f1`
echo " Created aa-ac.build.sqlite successfully ($SIZE kb)" | tee -a $LOG
fi
if [ -f "$ARCHIVE"/db/aa-ac.build.sqlite ]; then
date +'* Start Compressing aa-ac.sqlite Data Base (%c)' | tee -a $LOG
pbzip2 -f "$ARCHIVE"/db/aa-ac.build.sqlite
if [ $? -ne "0" ]; then
echo " Creation of aa-ac.sqlite.build.bz2 failed" | tee -a $LOG
rm -f $ARCHIVE/db/aa-ac.build.*
else
rm -f $ARCHIVE/db/aa-ac.sqlite.bz2
mv $ARCHIVE/db/aa-ac.build.sqlite.bz2 $ARCHIVE/db/aa-ac.sqlite.bz2
SIZE=`du -k $ARCHIVE/db/aa-ac.sqlite.bz2 | cut -f1`
echo " Created aa-ac.sqlite.bz2 successfully ($SIZE kb)" | tee -a $LOG
fi
fi
date +"* Start Creating full.sqlite Data Base (%c) using $SQLITE (data: $DBARCHIVE/data)" | tee -a $LOG
# Update full database
rm -f $ARCHIVE/db/full.build.*
"$CRAWLERHOME/scripts/merge_dbs" "$DBARCHIVE/data" "$ARCHIVE/db/full.build.sqlite" &> $LOGPREFIX-sqlite-full.log
if [ $? -ne "0" ]; then
echo " Creation of full.build.sqlite failed - see log file for details" | tee -a $LOG
else
SIZE=`du -k $ARCHIVE/db/full.build.sqlite | cut -f1`
echo " Created full.build.sqlite successfully ($SIZE kb)" | tee -a $LOG
fi
sync -f "$ARCHIVE"
sync -f "$ARCHIVE"
sync -f "$ARCHIVE"
if [ -f "$ARCHIVE"/db/full.build.sqlite ]; then
rm -f "$ARCHIVE"/db/full.sqlite
cp "$ARCHIVE"/db/full.build.sqlite "$ARCHIVE"/db/full.sqlite
sync -f "$ARCHIVE"
sync -f "$ARCHIVE"
sync -f "$ARCHIVE"
date +'* Start analysis (%c)' | tee -a $LOG
queries=`dirname $0`
sqlite3 "$ARCHIVE"/db/full.sqlite < $queries/../queries/get_added_permissions.sql | tee -a $LOGPREFIX-analysis.log | /usr/bin/mail -s "Extension Analysis" root
date +'* Analysis finished (%c)' | tee -a $LOG
date +'* Start Compressing full.build.sqlite Data Base (%c)' | tee -a $LOG
pbzip2 -f "$ARCHIVE"/db/full.build.sqlite
if [ $? -ne "0" ]; then
rm -f $ARCHIVE/db/full.build.*
echo " Creation of full.sqlite.bz2 failed" | tee -a $LOG
else
mv $ARCHIVE/db/full.build.sqlite.bz2 $ARCHIVE/db/full.sqlite.bz2
SIZE=`du -k $ARCHIVE/db/full.sqlite.bz2 | cut -f1`
echo " Created full.sqlite.bz2 successfully ($SIZE kb)" | tee -a $LOG
fi
fi
# date +'* Start Compressing Log files (%c)'
# for f in $ARCHIVE/log/*.log; do
# pbzip2 -f $f &> /dev/null
# done
date +'* Update Finished (%c)' | tee -a $LOG

View File

@ -1,100 +0,0 @@
#!/usr/bin/env python3
import sqlite3
import sys
import os
import fnmatch
MAX_ATTACHED_DBS = 10
def merge_and_detach(con, currently_attached, tablenames):
for db in currently_attached:
for tablename in tablenames:
con.execute(
"INSERT OR IGNORE INTO {tablename} SELECT * from {dbname}.{tablename};".
format(tablename=tablename, dbname=db))
con.commit()
con.commit()
for db in currently_attached:
con.execute("DETACH DATABASE {}".format(db))
def get_tablenames(dbpath):
with sqlite3.connect(dbpath) as con:
create_strings = con.execute(
"""select name from sqlite_master where type='table'"""
""" and name NOT LIKE '%!_fts%' escape '!';""").fetchall()
return [x[0] for x in create_strings]
def merge_schema(con, dbpath):
con.execute("ATTACH DATABASE ? as schemadb;", (dbpath, ))
create_strings = con.execute(
"""select sql from schemadb.sqlite_master where """
"""(type='table' AND name NOT LIKE '%!_fts!_%' escape '!') OR """
"""type='trigger';""")
for (create_string, ) in create_strings:
print(create_string)
con.execute(create_string)
con.execute("DETACH DATABASE schemadb;")
def find(pattern, path):
for root, dirs, files in os.walk(path):
for name in files:
if fnmatch.fnmatch(name, pattern):
yield os.path.join(root, name)
def help():
print("Usage: merge_dbs DBSPATH TODB")
print(" DBSPATH the folder containing the *.sqlite files")
print(" (searched recursivly)")
print(" TODB the destination sqlite file")
def main(argv):
if len(argv) != 2:
help()
sys.exit(1)
dbspath, todb = argv[:2]
print("Using sqlite3 version {}".format(sqlite3.sqlite_version))
if os.path.isdir(dbspath):
sqlitepaths = list(find("*.sqlite", dbspath))
else:
sqlitepaths = [dbspath]
firstdb = sqlitepaths[0]
tablenames = get_tablenames(firstdb)
with sqlite3.connect(todb) as con:
if con.execute("SELECT COUNT(*) FROM sqlite_master;").fetchone()[
0] == 0:
print("Merging schema from {}".format(firstdb))
merge_schema(con, firstdb)
con.execute("PRAGMA default_temp_store=2;")
con.execute("PRAGMA cache_size=20000;")
con.execute("PRAGMA temp_store=2;")
currently_attached = []
for i, dbpath in enumerate(sqlitepaths):
dbname = "db{}".format(i)
print("Attaching {}".format(dbpath))
con.execute("ATTACH DATABASE ? as ?", (dbpath, dbname))
currently_attached += [dbname]
if len(currently_attached) % MAX_ATTACHED_DBS == 0 or i + 1 == len(
sqlitepaths):
merge_and_detach(con, currently_attached, tablenames)
currently_attached = []
con.commit()
if __name__ == "__main__":
main(sys.argv[1:])

View File

@ -1,19 +0,0 @@
#!/bin/bash
#
#$ -j yes
#
# Example invocation:
# qsub merge_dbs DBDIR OUTDBPATH EXTENSIONCRAWLERPATH
# DBDIR base dir for the generated db files
# OUTDBPATH the generated full db
# EXTENSIONCRAWLERPATH ExtensionCrawler git repo
DBDIR="$1"
OUTDBPATH="$2"
EXTENSIONCRAWLERDIR="$3"
export PATH=~/bin:$PATH
export LD_LIBRARY_PATH=~/lib:${LD_LIBRARY_PATH:-}
set -u
"$EXTENSIONCRAWLERDIR/scripts/merge_dbs" "$DBDIR" "$OUTDBPATH"

View File

@ -1,33 +0,0 @@
#!/bin/bash
#
#$ -t 1-256
#$ -j yes
#
# Example invocation:
# qsub merge_each_db DBDIR OUTDBPATH EXTENSIONCRAWLERPATH
# DBDIR base dir for the generated db files
# OUTDBPATH path containing the three-letter-dirs
# EXTENSIONCRAWLERPATH ExtensionCrawler git repo
DBDIR="$1"
OUTDBPATH="$2"
EXTENSIONCRAWLERDIR="$3"
module -s load apps/python/conda 2> /dev/null
source activate mypython35
export PATH=~/bin:$PATH
export LD_LIBRARY_PATH=~/lib:${LD_LIBRARY_PATH:-}
function task_id_to_letter_256 {
ABC=abcdefghijklmnopqrstuvwxyz
let "I1 = (($1-1) / 16) % 16"
let "I2 = ($1-1) % 16"
echo ${ABC:$I1:1}${ABC:$I2:1}
}
set -u
find "$DBDIR" -name "$(task_id_to_letter_256 $SGE_TASK_ID)*.sqlite" -print0 | while IFS= read -r -d '' file; do
DBNAME=$(basename "$file")
"$EXTENSIONCRAWLERDIR/scripts/merge_dbs" "$file" "$OUTDBPATH/${DBNAME:0:3}/$DBNAME"
done

View File

@ -14,7 +14,7 @@ ssh sharc.shef.ac.uk mkdir -p $TARGETDIR/logs
ssh sharc.shef.ac.uk mkdir -p $TARGETDIR/out
echo "Pushing $BASEDIR to sharc.shef.ac.uk:$TARGETDIR/ExtensionCrawler ..."
rsync -zr "$BASEDIR/" sharc.shef.ac.uk:"$TARGETDIR/ExtensionCrawler"
rsync -zr --exclude "$BASEDIR/archive" "$BASEDIR/" sharc.shef.ac.uk:"$TARGETDIR/ExtensionCrawler"
echo "Starting job ..."
ssh sharc.shef.ac.uk \