Merge with upstream.
commit 3c9ccb4da9
@@ -34,14 +34,13 @@ import datetime
 import dateutil
 import dateutil.parser
 import requests
-import logging

 from ExtensionCrawler.config import (
     const_review_payload, const_review_search_url, const_download_url,
     get_local_archive_dir, const_overview_url, const_support_url,
     const_support_payload, const_review_search_payload, const_review_url)
-from ExtensionCrawler.util import google_dos_protection, value_of
-from ExtensionCrawler.sqlite import db_file, update_sqlite_incremental
+from ExtensionCrawler.util import google_dos_protection, value_of, log_info, log_exception
+from ExtensionCrawler.db import update_db_incremental


 class Error(Exception):
@@ -223,11 +222,10 @@ def update_overview(tar, date, ext_id):
     res = None
     try:
         res = requests.get(const_overview_url(ext_id), timeout=10)
-        logging.info(8 * " " +
-                     "* overview page: {}".format(str(res.status_code)))
+        log_info("* overview page: {}".format(str(res.status_code)), 2, ext_id)
         store_request_text(tar, date, 'overview.html', res)
     except Exception as e:
-        logging.exception("Exception when retrieving overview page")
+        log_exception("Exception when retrieving overview page", 2, ext_id)
         write_text(tar, date, 'overview.html.exception',
                    traceback.format_exc())
         return RequestResult(res, e)
@@ -266,8 +264,9 @@ def update_crx(archivedir, tmptardir, ext_id, date):
             stream=True,
             headers=headers,
             timeout=10)
-        logging.info(8 * " " + "* crx archive (Last: {}): {}".format(
-            value_of(last_crx_http_date, "n/a"), str(res.status_code)))
+        log_info("* crx archive (Last: {}): {}".format(
+            value_of(last_crx_http_date, "n/a"), str(res.status_code)), 2,
+                 ext_id)
         extfilename = os.path.basename(res.url)
         if re.search('&', extfilename):
             extfilename = "default.crx"
@@ -278,12 +277,12 @@ def update_crx(archivedir, tmptardir, ext_id, date):
                 timeout=10,
                 allow_redirects=True).headers.get('ETag')
             write_text(tmptardir, date, extfilename + ".etag", etag)
-            logging.info(12 * " " +
-                         "- checking etag, last: {}".format(last_crx_etag))
-            logging.info(12 * " " + "  current: {}".format(etag))
+            log_info("- checking etag, last: {}".format(last_crx_etag), 3,
+                     ext_id)
+            log_info("  current: {}".format(etag), 3, ext_id)

             if (etag is not "") and (etag != last_crx_etag):
-                logging.info(12 * " " + "- downloading due to different etags")
+                log_info("- downloading due to different etags", 3, ext_id)

                 res = requests.get(
                     const_download_url().format(ext_id),
@@ -304,7 +303,7 @@ def update_crx(archivedir, tmptardir, ext_id, date):
                 write_text(tmptardir, date, extfilename + ".etag",
                            res.headers.get("ETag"))
     except Exception as e:
-        logging.exception("Exception when updating crx")
+        log_exception("Exception when updating crx", 3, ext_id)
         write_text(tmptardir, date, extfilename + ".exception",
                    traceback.format_exc())
         return RequestResult(res, e)
@@ -330,8 +329,8 @@ def update_reviews(tar, date, ext_id):
             const_review_url(),
             data=const_review_payload(ext_id, "0", "100"),
             timeout=10)
-        logging.info(8 * " " +
-                     "* review page 0-100: {}".format(str(res.status_code)))
+        log_info("* review page 0-100: {}".format(str(res.status_code)), 2,
+                 ext_id)
         store_request_text(tar, date, 'reviews000-099.text', res)
         pages += [res.text]

@@ -340,8 +339,8 @@ def update_reviews(tar, date, ext_id):
             const_review_url(),
             data=const_review_payload(ext_id, "100", "100"),
             timeout=10)
-        logging.info(8 * " " + "* review page 100-200: {}".format(
-            str(res.status_code)))
+        log_info("* review page 100-200: {}".format(str(res.status_code)), 2,
+                 ext_id)
         store_request_text(tar, date, 'reviews100-199.text', res)
         pages += [res.text]

@@ -354,11 +353,11 @@ def update_reviews(tar, date, ext_id):
                 const_review_search_url(),
                 data=const_review_search_payload(ext_id_author_tups),
                 timeout=10)
-            logging.info(8 * " " + "* review page replies: {}".format(
-                str(res.status_code)))
+            log_info("* review page replies: {}".format(str(res.status_code)),
+                     2, ext_id)
             store_request_text(tar, date, 'reviewsreplies.text', res)
     except Exception as e:
-        logging.exception("Exception when updating reviews")
+        log_exception("Exception when updating reviews", 2, ext_id)
         write_text(tar, date, 'reviews.html.exception', traceback.format_exc())
         return RequestResult(res, e)
     return RequestResult(res)
@@ -374,8 +373,8 @@ def update_support(tar, date, ext_id):
             const_support_url(),
             data=const_support_payload(ext_id, "0", "100"),
             timeout=10)
-        logging.info(8 * " " +
-                     "* support page 0-100: {}".format(str(res.status_code)))
+        log_info("* support page 0-100: {}".format(str(res.status_code)), 2,
+                 ext_id)
         store_request_text(tar, date, 'support000-099.text', res)
         pages += [res.text]

@@ -384,8 +383,8 @@ def update_support(tar, date, ext_id):
             const_support_url(),
             data=const_support_payload(ext_id, "100", "100"),
             timeout=10)
-        logging.info(8 * " " +
-                     "* support page 100-200: {}".format(str(res.status_code)))
+        log_info("* support page 100-200: {}".format(str(res.status_code)), 2,
+                 ext_id)
         store_request_text(tar, date, 'support100-199.text', res)
         pages += [res.text]

@@ -398,19 +397,19 @@ def update_support(tar, date, ext_id):
                 const_review_search_url(),
                 data=const_review_search_payload(ext_id_author_tups),
                 timeout=10)
-            logging.info(8 * " " + "* support page replies: {}".format(
-                str(res.status_code)))
+            log_info("* support page replies: {}".format(str(res.status_code)),
+                     2, ext_id)
             store_request_text(tar, date, 'supportreplies.text', res)
     except Exception as e:
-        logging.exception("Exception when updating support pages")
+        log_exception("Exception when updating support pages", 2, ext_id)
         write_text(tar, date, 'support.html.exception', traceback.format_exc())
         return RequestResult(res, e)
     return RequestResult(res)


 def update_extension(archivedir, forums, ext_id):
-    logging.info(4 * " " + "Updating extension {}{}".format(
-        ext_id, " (including forums)" if forums else ""))
+    log_info("Updating extension {}".format(" (including forums)"
+                                            if forums else ""), 1, ext_id)
     is_new = False
     tar_exception = None
     sql_exception = None

@@ -426,12 +425,12 @@ def update_extension(archivedir, forums, ext_id):
     try:
         tmpdir = tempfile.mkdtemp()
         tmptardir = os.path.join(tmpdir, ext_id)
-        logging.info(8 * " " + "* tmptardir = {}".format(tmptardir))
+        log_info("* tmptardir = {}".format(tmptardir), 2, ext_id)
         os.makedirs(
             os.path.join(archivedir, get_local_archive_dir(ext_id)),
             exist_ok=True)
     except Exception as e:
-        logging.exception(8 * " " + "* FATAL: cannot create tmpdir")
+        log_exception("* FATAL: cannot create tmpdir", 3, ext_id)
         tar_exception = e
         return UpdateResult(ext_id, is_new, tar_exception, None, None, None,
                             None, sql_exception, False)
@@ -462,8 +461,7 @@ def update_extension(archivedir, forums, ext_id):
         if os.path.exists(tar):
             shutil.copyfile(tar, tardir + ".bak.tar")
     except Exception as e:
-        logging.exception(8 * " " +
-                          "* FATAL: cannot rename old tar archive")
+        log_exception("* FATAL: cannot rename old tar archive", 3, ext_id)
         tar_exception = e
         try:
             write_text(tardir, date, ext_id + ".tar.rename.exception",
@@ -478,7 +476,7 @@ def update_extension(archivedir, forums, ext_id):
             ar.add(tmptardir, arcname=ext_id)
             ar.close()
     except Exception as e:
-        logging.exception(8 * " " + "* FATAL: cannot create tar archive")
+        log_exception("* FATAL: cannot create tar archive", 3, ext_id)
         tar_exception = e
         try:
             write_text(tardir, date, ext_id + ".tar.create.exception",
@@ -487,12 +485,10 @@ def update_extension(archivedir, forums, ext_id):
             pass

     try:
-        logging.info(8 * " " + "* Updating db...")
-        db_path = db_file(archivedir, ext_id)
-        update_sqlite_incremental(db_path, tmptardir, ext_id, date)
+        update_db_incremental(tmptardir, ext_id, date)
         sql_success = True
     except Exception as e:
-        logging.exception(8 * " " + "* Exception during update of sqlite db")
+        log_exception("* Exception during update of db", 3, ext_id)
         sql_exception = e

     try:
@@ -503,7 +499,7 @@ def update_extension(archivedir, forums, ext_id):
     try:
         shutil.rmtree(path=tmpdir)
     except Exception as e:
-        logging.exception(8 * " " + "* FATAL: cannot remove archive directory")
+        log_exception("* FATAL: cannot remove archive directory", 3, ext_id)
         tar_exception = e
         try:
             write_text(tardir, date, ext_id + ".dir.remove.exception",
@@ -511,8 +507,11 @@ def update_extension(archivedir, forums, ext_id):
         except Exception:
             pass

-    logging.info(8 * " " + "* Duration: {}".format(
-        datetime.timedelta(seconds=int(time.time() - start))))
+    log_info(
+        "* Duration: {}".format(
+            datetime.timedelta(seconds=int(time.time() - start))),
+        2,
+        ext_id)
     return UpdateResult(ext_id, is_new, tar_exception, res_overview, res_crx,
                         res_reviews, res_support, sql_exception, sql_success)

@@ -522,22 +521,20 @@ def update_extensions(archivedir, parallel, forums_ext_ids, ext_ids):
     ext_without_forums = []
     ext_ids = list(set(ext_ids) - set(forums_ext_ids))
     forums_ext_ids = list(set(forums_ext_ids))
-    logging.info("Updating {} extensions ({} including forums)".format(
+    log_info("Updating {} extensions ({} including forums)".format(
         len(ext_ids), len(forums_ext_ids)))
     # First, update extensions with forums sequentially (and with delays) to
     # avoid running into Googles DDOS detection.
-    logging.info(2 * " " +
-                 "Updating {} extensions including forums (sequentially))".
-                 format(len(forums_ext_ids)))
+    log_info("Updating {} extensions including forums (sequentially))".format(
+        len(forums_ext_ids)), 1)

     ext_with_forums = list(
         map(partial(update_extension, archivedir, True), forums_ext_ids))

     # Second, update extensions without forums parallel to increase speed.
     parallel_ids = list(set(ext_ids) - set(forums_ext_ids))
-    logging.info(2 * " " +
-                 "Updating {} extensions excluding forums (parallel))".format(
-                 len(parallel_ids)))
+    log_info("Updating {} extensions excluding forums (parallel))".format(
+        len(parallel_ids)), 1)
     with Pool(parallel) as p:
         ext_without_forums = list(
             p.map(partial(update_extension, archivedir, False), parallel_ids))

@@ -125,15 +125,6 @@ def archive_file(archivedir, ext_id):
         str(archivedir), get_local_archive_dir(ext_id), ext_id + ".tar")


-def db_file(archivedir, ext_id):
-    """DB (sqlite) file of an extension."""
-    return os.path.join(archivedir,
-                        get_local_archive_dir(ext_id), ext_id + ".sqlite")
-
-def jsloc_timeout():
-    """Maximum number of seconds for counting jsloc per extension."""
-    return 600
-
 def const_basedir():
     """Top-level directory for the extension crawler archive."""
     return "archive"

@@ -153,14 +144,11 @@ def const_discover():
     """Default configuration of discovery mode"""
     return False

-def const_use_mysql():
-    return False
-
 def const_mysql_config_file():
     return "~/.my.cnf"

 def const_mysql_maxtries():
-    return 3
+    return 30

 def const_mysql_try_wait():
-    return 2
+    return 10
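The two constant changes above are easy to read past: the MySQL retry budget grows from 3 attempts 2 s apart to 30 attempts 10 s apart, so a flaky MariaDB connection is now tolerated for several minutes before the error propagates (the retry loop itself lives in MysqlBackend below). A quick sanity check of that arithmetic, as a sketch:

    # Worst case: maxtries attempts, with a wait after every failed
    # attempt except the last (values introduced by this commit).
    maxtries, try_wait = 30, 10
    print((maxtries - 1) * try_wait)  # 290 s of waiting, i.e. roughly 5 minutes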
@@ -21,7 +21,6 @@ from ExtensionCrawler.crx import *
 from ExtensionCrawler.archive import *
 from ExtensionCrawler.js_decomposer import decompose_js, DetectionType, FileClassification

-from ExtensionCrawler.dbbackend.sqlite_backend import SqliteBackend
 from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend

 import re
@@ -31,7 +30,7 @@ import json
 import os
 import glob
 import datetime
-import logging
+import hashlib


 def get_etag(ext_id, datepath, con):
@@ -53,8 +52,8 @@ def get_etag(ext_id, datepath, con):
             if "ETag" in headers:
                 return headers["ETag"]
     except Exception:
-        logging.warning(16 * " " +
-                        "* WARNING: could not parse crx header file")
+        log_warning("* WARNING: could not parse crx header file", 3,
+                    ext_id)

     # Trying to look up previous etag in database
     linkpath = next(
@@ -64,8 +63,7 @@ def get_etag(ext_id, datepath, con):
         link = f.read()
         linked_date = link[3:].split("/")[0]

-        result = con.get_most_recent_etag(ext_id,
-                                          con.convert_date(linked_date))
+        result = con.get_etag(ext_id, con.convert_date(linked_date))
         if result is not None:
             return result

@@ -100,7 +98,7 @@ def get_crx_status(datepath):


 def parse_and_insert_overview(ext_id, date, datepath, con):
-    logging.info(16 * " " + "- parsing overview file")
+    log_debug("- parsing overview file", 3, ext_id)
     overview_path = os.path.join(datepath, "overview.html")
     if os.path.exists(overview_path):
         with open(overview_path) as overview_file:
@@ -187,13 +185,14 @@ def parse_and_insert_overview(ext_id, date, datepath, con):
             "category",
             extid=ext_id,
             date=con.convert_date(date),
+            category_md5=hashlib.md5(category.encode()).digest(),
             category=category)


 def parse_and_insert_crx(ext_id, date, datepath, con):
-    logging.info(16 * " " + "- parsing crx file")
     crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
     if crx_path:
+        log_debug("- parsing crx file", 3, ext_id)
         filename = os.path.basename(crx_path)

         with ZipFile(crx_path) as f:
@@ -235,6 +234,8 @@ def parse_and_insert_crx(ext_id, date, datepath, con):
                 con.insert(
                     "permission",
                     crx_etag=etag,
+                    permission_md5=hashlib.md5(
+                        str(permission).encode()).digest(),
                     permission=str(permission))
         if "content_scripts" in manifest:
             for csd in manifest["content_scripts"]:
@@ -243,21 +244,26 @@ def parse_and_insert_crx(ext_id, date, datepath, con):
                 con.insert(
                     "content_script_url",
                     crx_etag=etag,
+                    url_md5=hashlib.md5(
+                        str(urlpattern).encode()).digest(),
                     url=str(urlpattern))

         js_files = decompose_js(f)
         for js_file_info in js_files:
-            # TODO: Add: evidenceStartPos, evidenceEndPos, and EvidenceText, sha1
+            # TODO: md5, sha1, size, path, type, detect_method, crx_etag, filename should be non-null
             con.insert(
                 "jsfile",
                 crx_etag=etag,
                 detect_method=(js_file_info['detectionMethod']).value,
+                evidence_start_pos=str(js_file_info['evidenceStartPos']),
+                evidence_end_pos=str(js_file_info['evidenceEndPos']),
+                evidence_text=str(js_file_info['evidenceText']),
                 filename=js_file_info['jsFilename'],
                 type=(js_file_info['type']).value,
                 lib=js_file_info['lib'],
                 path=js_file_info['path'],
                 encoding=js_file_info['encoding'],
                 md5=js_file_info['md5'],
+                sha1=js_file_info['sha1'],
                 size=js_file_info['size'],
                 version=js_file_info['version'])

@@ -268,7 +274,7 @@ def get(d, k):


 def parse_and_insert_review(ext_id, date, reviewpath, con):
-    logging.info(16 * " " + "- parsing review file")
+    log_debug("- parsing review file", 3, ext_id)
     with open(reviewpath) as f:
         content = f.read()
         stripped = content[content.find('{"'):]
@@ -304,7 +310,7 @@ def parse_and_insert_review(ext_id, date, reviewpath, con):


 def parse_and_insert_support(ext_id, date, supportpath, con):
-    logging.info(16 * " " + "- parsing support file")
+    log_debug("- parsing support file", 3, ext_id)
     with open(supportpath) as f:
         content = f.read()
         stripped = content[content.find('{"'):]
@@ -340,12 +346,13 @@ def parse_and_insert_support(ext_id, date, supportpath, con):


 def parse_and_insert_replies(ext_id, date, repliespath, con):
-    logging.info(16 * " " + "- parsing reply file")
+    log_debug("- parsing reply file", 3, ext_id)
     with open(repliespath) as f:
         d = json.load(f)
         if not "searchResults" in d:
-            logging.warning("* WARNING: there are no search results in {}".
-                            format(repliespath))
+            log_warning("* WARNING: there are no search results in {}".format(
+                repliespath), 3, ext_id)
             return
         results = []
         for result in d["searchResults"]:
             if "annotations" not in result:
@@ -380,7 +387,7 @@ def parse_and_insert_replies(ext_id, date, repliespath, con):


 def parse_and_insert_status(ext_id, date, datepath, con):
-    logging.info(16 * " " + "- parsing status file")
+    log_debug("- parsing status file", 3, ext_id)
     overview_status = get_overview_status(datepath)
     crx_status = get_crx_status(datepath)

@@ -399,31 +406,26 @@ def parse_and_insert_status(ext_id, date, datepath, con):
         overview_exception=overview_exception)


-def update_sqlite_incremental(db_path, tmptardir, ext_id, date):
-    logging.info(12 * " " + "- parsing data from {}".format(date))
+def update_db_incremental(tmptardir, ext_id, date):
+    log_info("* Updating db with data from from {}".format(date), 2, ext_id)
     datepath = os.path.join(tmptardir, date)

-    if const_use_mysql():
-        # Don't forget to create a ~/.my.cnf file with the credentials
-        backend = MysqlBackend(read_default_file=const_mysql_config_file())
-    else:
-        backend = SqliteBackend(db_path)
-
-    with backend as con:
+    # Don't forget to create a ~/.my.cnf file with the credentials
+    with MysqlBackend(
+            ext_id, read_default_file=const_mysql_config_file()) as con:
         etag = get_etag(ext_id, datepath, con)

         if etag:
             try:
                 parse_and_insert_crx(ext_id, date, datepath, con)
             except zipfile.BadZipfile as e:
-                logging.warning(
-                    16 * " " +
+                log_warning(
                     "* WARNING: the found crx file is not a zip file, exception: {}".
-                    format(str(e)))
+                    format(str(e)), 3, ext_id)
         else:
             crx_status = get_crx_status(datepath)
             if crx_status != 401 and crx_status != 204 and crx_status != 404:
-                logging.warning(16 * " " + "* WARNING: could not find etag")
+                log_warning("* WARNING: could not find etag", 3, ext_id)

         parse_and_insert_overview(ext_id, date, datepath, con)
         parse_and_insert_status(ext_id, date, datepath, con)
@@ -433,24 +435,24 @@ def update_sqlite_incremental(db_path, tmptardir, ext_id, date):
             try:
                 parse_and_insert_review(ext_id, date, reviewpath, con)
             except json.decoder.JSONDecodeError as e:
-                logging.warning(16 * " " +
-                                "* Could not parse review file, exception: {}".
-                                format(str(e)))
+                log_warning(
+                    "* Could not parse review file, exception: {}".format(
+                        str(e)), 3, ext_id)

         supportpaths = glob.glob(os.path.join(datepath, "support*-*.text"))
         for supportpath in supportpaths:
             try:
                 parse_and_insert_support(ext_id, date, supportpath, con)
             except json.decoder.JSONDecodeError as e:
-                logging.warning(
-                    16 * " " + "* Could not parse support file, exception: {}".
-                    format(str(e)))
+                log_warning(
+                    "* Could not parse support file, exception: {}".format(
+                        str(e)), 3, ext_id)

         repliespaths = glob.glob(os.path.join(datepath, "*replies.text"))
         for repliespath in repliespaths:
             try:
                 parse_and_insert_replies(ext_id, date, repliespath, con)
             except json.decoder.JSONDecodeError as e:
-                logging.warning(16 * " " +
-                                "* Could not parse reply file, exception: {}".
-                                format(str(e)))
+                log_warning(
+                    "* Could not parse reply file, exception: {}".format(
+                        str(e)), 3, ext_id)
@@ -20,6 +20,7 @@ import MySQLdb
 import _mysql_exceptions
 import atexit
 import time
+from ExtensionCrawler.util import log_info, log_error, log_exception

 db = None

@@ -31,38 +32,47 @@ def close_db():

 atexit.register(close_db)

-def retry(f):
-    for t in range(const_mysql_maxtries()):
-        try:
-            return f()
-        except _mysql_exceptions.OperationalError as e:
-            last_exception = e
-            if t + 1 == const_mysql_maxtries():
-                raise last_exception
-            else:
-                time.sleep(const_mysql_try_wait())
-
-
 class MysqlBackend:
-    def __init__(self, **kwargs):
+    def retry(self, f):
+        for t in range(const_mysql_maxtries()):
+            try:
+                return f()
+            except _mysql_exceptions.OperationalError as e:
+                last_exception = e
+                if t + 1 == const_mysql_maxtries():
+                    log_exception("MySQL connection eventually failed!", 3,
+                                  self.ext_id)
+                    raise last_exception
+                else:
+                    log_exception(
+                        """Exception on mysql connection attempt {} of {}, """
+                        """wating {}s before retrying...""".format(
+                            t + 1,
+                            const_mysql_maxtries(),
+                            const_mysql_try_wait()), 3, self.ext_id)
+                    time.sleep(const_mysql_try_wait())
+
+    def __init__(self, ext_id, **kwargs):
+        self.ext_id = ext_id
         self.dbargs = kwargs

     def __enter__(self):
         global db
         if db is None:
-            db = retry(lambda: MySQLdb.connect(**self.dbargs))
-        self.cursor = retry(lambda: db.cursor())
+            db = self.retry(lambda: MySQLdb.connect(**self.dbargs))
+        self.cursor = self.retry(lambda: db.cursor())

         return self

     def __exit__(self, *args):
-        retry(lambda: db.commit())
-        retry(lambda: self.cursor.close())
+        self.retry(lambda: db.commit())
+        self.retry(lambda: self.cursor.close())

     def get_single_value(self, query, args):
-        retry(lambda: self.cursor.execute(query, args))
+        self.retry(lambda: self.cursor.execute(query, args))

-        result = retry(lambda: self.cursor.fetchone())
+        result = self.retry(lambda: self.cursor.fetchone())
         if result is not None:
             return result[0]
         else:
@@ -81,15 +91,14 @@ class MysqlBackend:
             ",".join(len(args[0]) * ["%s"]),
             ",".join(
                 ["{c}=VALUES({c})".format(c=c) for c in arglist[0].keys()]))
-        retry(lambda: self.cursor.executemany(query, args))
+        self.retry(lambda: self.cursor.executemany(query, args))

     def insert(self, table, **kwargs):
         self.insertmany(table, [kwargs])

-    def get_most_recent_etag(self, extid, date):
+    def get_etag(self, extid, date):
         return self.get_single_value(
-            """SELECT crx_etag from extension e1 where extid=%s and date<%s and not exists """
-            """(select 1 from extension e2 where e2.extid=e1.extid and e2.date<e1.date)""",
+            """SELECT crx_etag from extension where extid=%s and date=%s""",
             (extid, date))

     def convert_date(self, date):
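For orientation, the reworked backend is bound to a single extension id and used as a context manager; a minimal usage sketch (the 32-character id and the date are placeholders, the credentials come from the option file exactly as in update_db_incremental above):

    from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend
    from ExtensionCrawler.config import const_mysql_config_file

    ext_id = "a" * 32  # hypothetical extension id
    # __enter__ opens (or reuses) the shared module-level connection,
    # retrying OperationalError up to const_mysql_maxtries() times;
    # __exit__ commits and closes the cursor.
    with MysqlBackend(ext_id, read_default_file=const_mysql_config_file()) as con:
        etag = con.get_etag(ext_id, con.convert_date("2017-01-01"))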
@@ -1,167 +0,0 @@
-#
-# Copyright (C) 2017 The University of Sheffield, UK
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-#
-
-import os
-import sqlite3
-
-
-def setup_fts_tables(con, name, columns, primary_columns):
-    sqls = [
-        s.format(
-            name=name,
-            columns=", ".join(columns),
-            new_columns=", ".join(["new." + x for x in columns]),
-            primary_columns=", ".join(primary_columns))
-        for s in [
-            """CREATE TABLE {name}({columns}, PRIMARY KEY ({primary_columns}));""",
-            """CREATE VIRTUAL TABLE {name}_fts using fts4(content="{name}", {columns});""",
-            """CREATE TRIGGER {name}_bu BEFORE UPDATE ON {name} BEGIN """
-            """DELETE FROM {name}_fts WHERE docid=old.rowid;"""
-            """END;""",
-            """CREATE TRIGGER {name}_bd BEFORE DELETE ON {name} BEGIN """
-            """DELETE FROM {name}_fts WHERE docid=old.rowid;"""
-            """END;""",
-            """CREATE TRIGGER {name}_au AFTER UPDATE ON {name} BEGIN """
-            """INSERT INTO {name}_fts(docid, {columns}) VALUES(new.rowid, {new_columns});"""
-            """END;""",
-            """CREATE TRIGGER {name}_ai AFTER INSERT ON {name} BEGIN """
-            """INSERT INTO {name}_fts(docid, {columns}) VALUES(new.rowid, {new_columns});"""
-            """END;"""
-        ]
-    ]
-    for sql in sqls:
-        con.execute(sql)
-
-
-def setup_tables(con):
-    setup_fts_tables(con, "support", [
-        "author", "commentdate", "extid", "date", "displayname", "title",
-        "language", "shortauthor", "comment"
-    ], ["author", "commentdate", "extid", "date"])
-
-    setup_fts_tables(con, "review", [
-        "author", "commentdate", "extid", "date", "displayname", "rating",
-        "language", "shortauthor", "comment"
-    ], ["author", "commentdate", "extid", "date"])
-
-    setup_fts_tables(con, "reply", [
-        "author", "commentdate", "extid", "date", "displayname", "replyto",
-        "language", "shortauthor", "comment"
-    ], ["author", "commentdate", "extid", "date"])
-
-    con.execute("""CREATE TABLE category ("""
-                """extid TEXT,"""
-                """date TEXT,"""
-                """category TEXT,"""
-                """PRIMARY KEY (extid, date, category)"""
-                """)""")
-    con.execute("""CREATE TABLE content_script_url ("""
-                """crx_etag TEXT,"""
-                """url TEXT,"""
-                """PRIMARY KEY (crx_etag, url)"""
-                """)""")
-    con.execute("""CREATE TABLE permission ("""
-                """crx_etag TEXT,"""
-                """permission TEXT,"""
-                """PRIMARY KEY (crx_etag, permission)"""
-                """)""")
-    con.execute("""CREATE TABLE crx ("""
-                """crx_etag TEXT PRIMARY KEY,"""
-                """filename TEXT,"""
-                """size INTEGER,"""
-                """publickey BLOB"""
-                """)""")
-    con.execute("""CREATE TABLE jsfile ("""
-                """crx_etag TEXT,"""
-                """detect_method TEXT,"""
-                """filename TEXT,"""
-                """type TEXT,"""
-                """lib TEXT,"""
-                """path TEXT,"""
-                """md5 TEXT,"""
-                """size INTEGER,"""
-                """version TEXT,"""
-                """PRIMARY KEY (crx_etag, path)"""
-                """)""")
-    con.execute("""CREATE TABLE status ("""
-                """extid TEXT,"""
-                """date TEXT,"""
-                """crx_status INTEGER,"""
-                """overview_status INTEGER,"""
-                """overview_exception TEXT,"""
-                """PRIMARY KEY (extid, date)"""
-                """)""")
-    con.execute("""CREATE TABLE extension ("""
-                """extid TEXT,"""
-                """date TEXT,"""
-                """name TEXT,"""
-                """version TEXT,"""
-                """description TEXT,"""
-                """downloads INTEGER,"""
-                """rating REAL,"""
-                """ratingcount INTEGER,"""
-                """fulldescription TEXT,"""
-                """developer TEXT,"""
-                """itemcategory TEXT,"""
-                """crx_etag TEXT,"""
-                """lastupdated TEXT,"""
-                """PRIMARY KEY (extid, date),"""
-                """FOREIGN KEY (crx_etag) REFERENCES crx(crx_etag)"""
-                """)""")
-
-
-class SqliteBackend:
-    def __init__(self, filename):
-        self.filename = filename
-
-    def __enter__(self):
-        new_db = False
-        if not os.path.exists(self.filename):
-            new_db = True
-        self.con = sqlite3.connect(self.filename)
-        if new_db:
-            setup_tables(self.con)
-        return self
-
-    def __exit__(self, *args):
-        self.con.commit()
-        self.con.close()
-
-    def get_single_value(self, query, args):
-        result = next(self.con.execute(query, args), None)
-        if result is not None:
-            return result[0]
-        else:
-            return None
-
-    def insert(self, table, **kwargs):
-        args = tuple(kwargs.values())
-        self.con.execute("INSERT OR REPLACE INTO {}({}) VALUES ({})".format(
-            table, ",".join(kwargs.keys()), ",".join(len(args) * ["?"])), args)
-
-    def insertmany(self, table, argslist):
-        for arg in argslist:
-            self.insert(table, **arg)
-
-    def get_most_recent_etag(self, extid, date):
-        return self.get_single_value(
-            """SELECT crx_etag from extension e1 where extid=? and date<? and not exists """
-            """(select 1 from extension e2 where e2.extid=e1.extid and e2.date<e1.date)""",
-            (extid, date))
-
-    def convert_date(self, date):
-        return date
@@ -23,7 +23,7 @@ import re
 from functools import reduce
 import requests
 import ExtensionCrawler.config
-import logging
+from ExtensionCrawler.util import log_info, log_exception


 def crawl_nearly_all_of_ext_ids():
@@ -45,24 +45,24 @@ def crawl_nearly_all_of_ext_ids():
         # The urls with a language parameter attached return a subset
         # of the ids that get returned by the plain urls, therefore we
         # skip urls with a language parameter
-        filter(is_generic_url, ([elem.text for elem in shard_elems])))
+        filter(is_generic_url, ([elem.text for elem in shard_elems])))[:1]
     shards = list(map(lambda u: requests.get(u, timeout=10).text, shard_urls))

     overview_urls = reduce(
         lambda x, y: x + y,
         map(lambda s: [elem.text for elem in get_inner_elems(s)], shards), [])
-    return [re.search("[a-z]{32}", url).group(0) for url in overview_urls]
+    return [re.search("[a-z]{32}", url).group(0) for url in overview_urls][:10]


 def get_new_ids(known_ids):
     """Discover new extension ids."""
-    logging.info("Discovering new ids ...")
+    log_info("Discovering new ids ...")
     discovered_ids = []
     try:
         discovered_ids = ExtensionCrawler.discover.crawl_nearly_all_of_ext_ids()
     except Exception:
-        logging.exception("Exception when discovering new ids")
+        log_exception("Exception when discovering new ids")
     new_ids = list(set(discovered_ids) - set(known_ids))
-    logging.info(2 * " " + "Discovered {} new extensions (out of {})".format(
-        len(new_ids), len(discovered_ids)))
+    log_info("Discovered {} new extensions (out of {})".format(
+        len(new_ids), len(discovered_ids)), 1)
     return new_ids

@@ -111,9 +111,9 @@ def init_jsinfo(zipfile, js_file):
         'evidenceText': None,
         'encoding': chardet.detect(data)['encoding'],
         'jsFilename': js_filename,
-        'md5': hashlib.md5(data).hexdigest(),
-        'sha1': hashlib.sha1(data).hexdigest(),
-        'size': file_size,
+        'md5': hashlib.md5(data).digest(),
+        'sha1': hashlib.sha1(data).digest(),
+        'size': file_size,
         'path': path
     }
     if js_info['size'] == 0:
@@ -218,7 +218,7 @@ def analyse_comment_generic_libs(zipfile, js_file, js_info, comment):
         filename = js_file.filename
     else:
         filename = js_file
-
+
     for unkregex in unknown_lib_identifiers():
         unkown_lib_matched = unkregex.finditer(comment.content)
         for match in unkown_lib_matched:
@@ -272,7 +272,7 @@ def decompose_js(file):
     if isinstance(file, str):
         js_files = [file]
     else:
-        zipfile = file
+        zipfile = file
         js_files = list(filter(lambda x: x.filename.endswith(".js"), zipfile.infolist()))

     for js_file in js_files:
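The md5/sha1 change above is easy to miss: hexdigest() returns a hex string, while digest() returns the raw bytes, which matches the binary digests that db.py now stores (category_md5, url_md5, and so on all use .digest() as well). A small illustration:

    import hashlib

    h = hashlib.md5(b"alert(1);")
    h.hexdigest()  # 32-character hex str
    h.digest()     # the same 128-bit value as 16 raw bytes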
@@ -20,6 +20,8 @@

 from time import sleep
 from random import random
+import traceback
+import logging

 def google_dos_protection(maxrange=0.3):
     """Wait a random number of seconds (between 0.5 to 0.5+maxrange)
@@ -32,3 +34,20 @@ def value_of(value, default):
         return value
     else:
         return default

+def log_debug(msg, indent_level=0, extid="-" * 32):
+    logging.debug(str(extid) + " " + 4 * indent_level * " " + str(msg))
+
+def log_info(msg, indent_level=0, extid="-" * 32):
+    logging.info(str(extid) + " " + 4 * indent_level * " " + str(msg))
+
+def log_warning(msg, indent_level=0, extid="-" * 32):
+    logging.warning(str(extid) + " " + 4 * indent_level * " " + str(msg))
+
+def log_error(msg, indent_level=0, extid="-" * 32):
+    logging.error(str(extid) + " " + 4 * indent_level * " " + str(msg))
+
+def log_exception(msg, indent_level=0, extid="-" * 32):
+    logging.error(str(extid) + " " + 4 * indent_level * " " + str(msg))
+    for line in traceback.format_exc().splitlines():
+        logging.error(str(extid) + " " + 4 * indent_level * " " + line)
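Taken together, these helpers replace the hand-padded 8 * " " prefixes used throughout the old code: every line is prefixed with the extension id (defaulting to 32 dashes) plus four spaces per indent level. A sketch of what a call site like the one in update_overview now produces:

    from ExtensionCrawler.util import log_info

    ext_id = "a" * 32  # hypothetical extension id
    log_info("* overview page: 200", 2, ext_id)
    # emits one line via logging.info: the 32-char id, a separator
    # space, 4 * 2 indent spaces, then the message:
    # aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa         * overview page: 200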
README.md
@@ -9,10 +9,13 @@ extension from the Chrome Web store.
   will check the integrity of the extension.
 * `crx-extract`: A simple tool for extracting `*.crx` files from the
   tar-based archive hierarchy.
-* `crx-jsinventory`: Build a JavaScript inventory of a `*.crx` file using a
+* `crx-jsinventory`: Build a JavaScript inventory of a `*.crx` file using a
   JavaScript decomposition analysis.
-* `create-db`: A tool for creating/initializing the database files
-  from already existing extension archives.
+* `crx-jsstrings`: A tool for extracting code blocks, comment blocks, and
+* string literals from JavaScript.
+* `create-db`: A tool for updating a remote MariaDB from already
+  existing extension archives.
+>>>>>>> bde59c50401448e346032219ab2130a31b3d7ec8

 The utilities store the extensions in the following directory
 hierarchy:
@@ -31,9 +34,12 @@ The crawler downloads the most recent extension (i.e., the `*.crx`
 file as well as the overview page. In addition, the `conf` directory
 may contain one file, called `forums.conf` that lists the ids of
 extensions for which the forums and support pages should be downloaded
-as well. The `data` directory will contain the downloaded extensions
-as well as sqlite files containing the extracted meta data. The sqlite
-files can easily be re-generated using the `create-db` tool.
+as well. The `data` directory will contain the downloaded extensions.
+
+The `crawler` and `create-db` scripts will access and update a MariaDB.
+They will use the host, datebase, and credentials found in `~/.my.cnf`.
+Since they make use of various JSON features, it is recommended to use at
+least version 10.2.8 of MariaDB.

 All utilities are written in Python 3.x. The required modules are listed
 in the file `requirements.txt`.
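Not part of the commit, but for context: a minimal `~/.my.cnf` of the shape the `crawler` and `create-db` scripts expect could look like this (host, credentials, and database name are placeholders):

    [client]
    host=localhost
    user=crawler
    password=secret
    database=extensions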
crawler
@@ -24,12 +24,12 @@ import sys
 import datetime
 import time
 import getopt
-import sqlite3
 import logging
 from functools import reduce
 from ExtensionCrawler.discover import get_new_ids
 from ExtensionCrawler.archive import get_forum_ext_ids, get_existing_ids, update_extensions
 from ExtensionCrawler.config import *
+from ExtensionCrawler.util import log_info


 def write_log(dirname, fname, text):
@@ -98,39 +98,39 @@ def log_summary(res, runtime=0):

     corrupt_tar_archives = list(filter(lambda x: x.corrupt_tar(), res))

-    logging.info("Summary:")
-    logging.info(" Updated {} out of {} extensions successfully".format(
+    log_info("Summary:")
+    log_info(" Updated {} out of {} extensions successfully".format(
         str(len(list(filter(lambda x: x.is_ok(), res)))), str(len(res))))
-    logging.info(" Updated extensions: {:8d}".format(
+    log_info(" Updated extensions: {:8d}".format(
         len(list(filter(lambda x: x.is_ok() and not x.not_modified(), res)))))
-    logging.info(" Updated SQL databases: {:8d}".format(
+    log_info(" Updated SQL databases: {:8d}".format(
         len(list(filter(lambda x: x.sql_success(), res)))))
-    logging.info(" New extensions: {:8d}".format(
+    log_info(" New extensions: {:8d}".format(
         len(list(filter(lambda x: x.is_new(), res)))))
-    logging.info(" Not authorized: {:8d}".format(
+    log_info(" Not authorized: {:8d}".format(
         len(list(filter(lambda x: x.not_authorized(), res)))))
-    logging.info(" Raised Google DDOS: {:8d}".format(
+    log_info(" Raised Google DDOS: {:8d}".format(
         len(list(filter(lambda x: x.raised_google_ddos(), res)))))
-    logging.info(" Not modified archives: {:8d}".format(
+    log_info(" Not modified archives: {:8d}".format(
         len(list(filter(lambda x: x.not_modified(), res)))))
-    logging.info(" Extensions not in store: {:8d}".format(
+    log_info(" Extensions not in store: {:8d}".format(
         len(list(filter(lambda x: x.not_in_store(), res)))))
-    logging.info(" Unknown exception: {:8d}".format(
+    log_info(" Unknown exception: {:8d}".format(
         len(list(filter(lambda x: x.has_exception(), res)))))
-    logging.info(
+    log_info(
         " Corrupt tar archives: {:8d}".format(len(corrupt_tar_archives)))
-    logging.info(" SQL exception: {:8d}".format(
+    log_info(" SQL exception: {:8d}".format(
         len(list(filter(lambda x: x.sql_exception(), res)))))
-    logging.info(" Total runtime: {}".format(
+    log_info(" Total runtime: {}".format(
         str(datetime.timedelta(seconds=int(runtime)))))

     if corrupt_tar_archives != []:
-        logging.info("")
-        logging.info("List of extensions with corrupted files/archives:")
+        log_info("")
+        log_info("List of extensions with corrupted files/archives:")
         list(
-            map(lambda x: logging.info(" " + x.id + ": " + str(x.exception), corrupt_tar_archives)
+            map(lambda x: log_info(" " + x.id + ": " + str(x.exception), corrupt_tar_archives)
             ))
-        logging.info("")
+        log_info("")


 def helpmsg():
@@ -144,14 +144,12 @@ def helpmsg():

 def print_config(basedir, archive_dir, conf_dir, discover, parallel):
     """Print current configuration."""
-    logging.info("Configuration:")
-    logging.info(" Base dir: {}".format(basedir))
-    logging.info(" Archive directory: {}".format(archive_dir))
-    logging.info(" Configuration directory: {}".format(conf_dir))
-    logging.info(" Discover new extensions: {}".format(discover))
-    logging.info(" Max num. of concurrent downloads: {}".format(parallel))
-    logging.info(" SQLite 3 version: {}".format(
-        sqlite3.sqlite_version))
+    log_info("Configuration:")
+    log_info(" Base dir: {}".format(basedir))
+    log_info(" Archive directory: {}".format(archive_dir))
+    log_info(" Configuration directory: {}".format(conf_dir))
+    log_info(" Discover new extensions: {}".format(discover))
+    log_info(" Max num. of concurrent downloads: {}".format(parallel))


 def parse_args(argv):
@@ -220,7 +218,7 @@ def main(argv):
     # they are often temporary
     has_exception = list(filter(lambda x: x.has_exception(), res))
     if has_exception != []:
-        logging.info(
+        log_info(
             " {} extensions with unknown exceptions, start another try ...".
             format(str(len(has_exception))))
         has_exception_ids = list(map(lambda x: x.id, has_exception))
create-db
@@ -24,16 +24,16 @@ import time
 import tempfile
 import fnmatch
 from multiprocessing import Pool
 from functools import partial
 import logging
 import datetime

-from ExtensionCrawler.archive import update_sqlite_incremental
+from ExtensionCrawler.archive import update_db_incremental
 from ExtensionCrawler.config import *
+from ExtensionCrawler.util import log_info, log_warning, log_error, log_exception


 def help():
-    print("create-db [OPTION] DBBASEDIR")
-    print(" DBBASEDIR directory for generated db files")
+    print("create-db [OPTION]")
     print(" -h print this help text")
     print(" -a <DIR> archive directory")
     print(" -p <PREFIX> three-letter-prefix")
@@ -43,26 +43,28 @@ def help():
     print(" -N <MAXTASKID> ")


-def process_id(dbbasedir, path):
+def process_id(path):
     start = time.time()
     with tempfile.TemporaryDirectory() as tmpdir:
         with tarfile.open(path) as t:
             t.extractall(tmpdir)

         extid = os.listdir(tmpdir)[0]
-        logging.info("Processing {}".format(extid))
-        dbpath = os.path.join(dbbasedir, extid + ".sqlite")
-        if os.path.exists(dbpath):
-            os.remove(dbpath)
+        log_info("Start processing extension", 0, extid)
         iddir = os.path.join(tmpdir, extid)

         for date in sorted(os.listdir(iddir)):
             try:
-                update_sqlite_incremental(dbpath, iddir, extid, date)
+                update_db_incremental(iddir, extid, date)
             except Exception:
-                logging.exception("Exception when handling {} on {}".
-                                  format(extid, date))
-        logging.info("Finished {} in {}s".format(extid, time.time() - start))
+                log_exception(
+                    "Exception when handling data from {}".format(date), 0,
+                    extid)
+        log_info(
+            "Finished extension in {}".format(
+                str(datetime.timedelta(seconds=int(time.time() - start)))),
+            0,
+            extid)


 def find(archive, pattern):
@@ -116,13 +118,6 @@ def parse_args(argv):
         elif opt in ("-N", "--maxtaskid"):
             maxtaskid = int(arg)

-    if len(args) < 1:
-        help()
-        sys.exit(2)
-
-    dbbasedir = args[0]
-
     paths += args[1:]
     if paths == []:
         paths = list(find(archive, "*"))

@@ -132,16 +127,16 @@ def parse_args(argv):
     else:
         paths = paths[(taskid - 1) * chunksize:taskid * chunksize]

-    return dbbasedir, paths, parallel
+    return paths, parallel


 def main(argv):
     logging.basicConfig(level=logging.INFO, format=const_log_format())

-    dbbasedir, paths, parallel = parse_args(argv)
+    paths, parallel = parse_args(argv)

     with Pool(processes=parallel) as p:
-        p.map(partial(process_id, dbbasedir), paths)
+        p.map(process_id, paths)


 if __name__ == "__main__":
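With the DBBASEDIR argument gone from create-db, an invocation now only points at the archive. Based on the option parsing above, a typical call might look like this (the archive path and prefix are placeholders):

    create-db -a /srv/Shared/BrowserExtensions/ -p aaa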
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-# Usage:
-# ./generate_small_db.sh [BASEDIR] [DBPATH] [EXTENSIONCRAWLER]
-# [BASEDIR] path to extension data, needs to contain aaa, aab, etc (defaults to sharc path)
-# [DBPATH] path to output db (defaults to ~/aa-ac.sqlite)
-# [EXTENSIONCRAWLER] path to git repo (defaults to ~/ExtensionCrawler)
-
-BASEDIR=${1:-/shared/brucker_research1/Shared/BrowserExtensions/data}
-DBPATH=${2:-~/aa-ac.sqlite}
-EXTENSIONCRAWLER=${3:-~/ExtensionCrawler}
-
-find "$BASEDIR" -mindepth 1 -maxdepth 1 -name "a[a-c]*" -exec "$EXTENSIONCRAWLER/scripts/merge_dbs" "{}" "$DBPATH" \;
@@ -1,13 +1,8 @@
 #!/bin/bash
-# m h dom mon dow command
-# 15 01 * * * (cd ~/ExtensionCrawler; ((git fetch ; git checkout production; git pull) &> /dev/null))
-# 33 01 * * * ~/ExtensionCrawler/scripts/global_update_db.sh
-# 07 02 * * * ~/ExtensionCrawler/scripts/global_update.sh
-

 export LD_LIBRARY_PATH=$HOME/local/lib:/usr/local/lib:$LD_LIBRARY_PATH
 export PATH=$HOME/local/bin:/usr/local/bin:$PATH

 ARCHIVE=${1:-/srv/Shared/BrowserExtensions/}
 CRAWLERHOME=${2:-~/ExtensionCrawler}
 LOGPREFIX=$ARCHIVE/log/`date --iso-8601=ns`

@@ -15,11 +10,8 @@ LOG=$LOGPREFIX-global.log

 date +'* Start Updating Extensions Archive (%c)' | tee $LOG

-SQLITE=`which sqlite3`
-
 # Update extensions
 (cd $CRAWLERHOME; (./crawler -d -a $ARCHIVE > $LOGPREFIX.log))
-

 date +'* Update Finished (%c)' | tee -a $LOG

@@ -1,90 +0,0 @@
-#!/bin/bash
-# m h dom mon dow command
-# 15 01 * * * (cd ~/ExtensionCrawler; ((git fetch ; git checkout production; git pull) &> /dev/null))
-# 33 01 * * * ~/ExtensionCrawler/scripts/global_update_db.sh
-# 07 02 * * * ~/ExtensionCrawler/scripts/global_update.sh
-
-
-export LD_LIBRARY_PATH=$HOME/local/lib:/usr/local/lib:$LD_LIBRARY_PATH
-export PATH=$HOME/local/bin:/usr/local/bin:$PATH
-
-ARCHIVE=${1:-/srv/Shared/BrowserExtensions/}
-CRAWLERHOME=${2:-~/ExtensionCrawler}
-LOGPREFIX=$ARCHIVE/log/`date --iso-8601=ns`
-LOG=$LOGPREFIX-global-db.log
-
-DBARCHIVE=`find $ARCHIVE/.snapshot -maxdepth 1 -mindepth 1 -name "D*" | head -n 1`
-
-SQLITE=`which sqlite3`
-
-date +"* Start Creating aa-ac.build.sqlite Data Base (%c) using $SQLITE (data: $DBARCHIVE/data)" | tee -a $LOG
-# Update small database
-rm -f $ARCHIVE/db/aa-ac.build.*
-
-(cd $CRAWLERHOME; (./scripts/generate_small_db.sh $DBARCHIVE/data $ARCHIVE/db/aa-ac.build.sqlite &> $LOGPREFIX-sqlite-aa-ac.log))
-if [ $? -ne "0" ]; then
-    echo " Creation of aa-ac.build.sqlite failed - see log file for details" | tee -a $LOG
-else
-    SIZE=`du -k $ARCHIVE/db/aa-ac.build.sqlite | cut -f1`
-    echo " Created aa-ac.build.sqlite successfully ($SIZE kb)" | tee -a $LOG
-fi
-
-if [ -f "$ARCHIVE"/db/aa-ac.build.sqlite ]; then
-    date +'* Start Compressing aa-ac.sqlite Data Base (%c)' | tee -a $LOG
-    pbzip2 -f "$ARCHIVE"/db/aa-ac.build.sqlite
-    if [ $? -ne "0" ]; then
-        echo " Creation of aa-ac.sqlite.build.bz2 failed" | tee -a $LOG
-        rm -f $ARCHIVE/db/aa-ac.build.*
-    else
-        rm -f $ARCHIVE/db/aa-ac.sqlite.bz2
-        mv $ARCHIVE/db/aa-ac.build.sqlite.bz2 $ARCHIVE/db/aa-ac.sqlite.bz2
-        SIZE=`du -k $ARCHIVE/db/aa-ac.sqlite.bz2 | cut -f1`
-        echo " Created aa-ac.sqlite.bz2 successfully ($SIZE kb)" | tee -a $LOG
-    fi
-fi
-
-date +"* Start Creating full.sqlite Data Base (%c) using $SQLITE (data: $DBARCHIVE/data)" | tee -a $LOG
-# Update full database
-rm -f $ARCHIVE/db/full.build.*
-"$CRAWLERHOME/scripts/merge_dbs" "$DBARCHIVE/data" "$ARCHIVE/db/full.build.sqlite" &> $LOGPREFIX-sqlite-full.log
-if [ $? -ne "0" ]; then
-    echo " Creation of full.build.sqlite failed - see log file for details" | tee -a $LOG
-else
-    SIZE=`du -k $ARCHIVE/db/full.build.sqlite | cut -f1`
-    echo " Created full.build.sqlite successfully ($SIZE kb)" | tee -a $LOG
-fi
-
-sync -f "$ARCHIVE"
-sync -f "$ARCHIVE"
-sync -f "$ARCHIVE"
-
-if [ -f "$ARCHIVE"/db/full.build.sqlite ]; then
-    rm -f "$ARCHIVE"/db/full.sqlite
-    cp "$ARCHIVE"/db/full.build.sqlite "$ARCHIVE"/db/full.sqlite
-    sync -f "$ARCHIVE"
-    sync -f "$ARCHIVE"
-    sync -f "$ARCHIVE"
-    date +'* Start analysis (%c)' | tee -a $LOG
-    queries=`dirname $0`
-    sqlite3 "$ARCHIVE"/db/full.sqlite < $queries/../queries/get_added_permissions.sql | tee -a $LOGPREFIX-analysis.log | /usr/bin/mail -s "Extension Analysis" root
-    date +'* Analysis finished (%c)' | tee -a $LOG
-
-    date +'* Start Compressing full.build.sqlite Data Base (%c)' | tee -a $LOG
-    pbzip2 -f "$ARCHIVE"/db/full.build.sqlite
-    if [ $? -ne "0" ]; then
-        rm -f $ARCHIVE/db/full.build.*
-        echo " Creation of full.sqlite.bz2 failed" | tee -a $LOG
-    else
-        mv $ARCHIVE/db/full.build.sqlite.bz2 $ARCHIVE/db/full.sqlite.bz2
-        SIZE=`du -k $ARCHIVE/db/full.sqlite.bz2 | cut -f1`
-        echo " Created full.sqlite.bz2 successfully ($SIZE kb)" | tee -a $LOG
-    fi
-fi
-
-# date +'* Start Compressing Log files (%c)'
-# for f in $ARCHIVE/log/*.log; do
-#     pbzip2 -f $f &> /dev/null
-# done
-
-date +'* Update Finished (%c)' | tee -a $LOG
@@ -1,100 +0,0 @@
-#!/usr/bin/env python3
-
-import sqlite3
-import sys
-import os
-import fnmatch
-
-MAX_ATTACHED_DBS = 10
-
-
-def merge_and_detach(con, currently_attached, tablenames):
-    for db in currently_attached:
-        for tablename in tablenames:
-            con.execute(
-                "INSERT OR IGNORE INTO {tablename} SELECT * from {dbname}.{tablename};".
-                format(tablename=tablename, dbname=db))
-        con.commit()
-
-    con.commit()
-    for db in currently_attached:
-        con.execute("DETACH DATABASE {}".format(db))
-
-
-def get_tablenames(dbpath):
-    with sqlite3.connect(dbpath) as con:
-        create_strings = con.execute(
-            """select name from sqlite_master where type='table'"""
-            """ and name NOT LIKE '%!_fts%' escape '!';""").fetchall()
-        return [x[0] for x in create_strings]
-
-
-def merge_schema(con, dbpath):
-    con.execute("ATTACH DATABASE ? as schemadb;", (dbpath, ))
-    create_strings = con.execute(
-        """select sql from schemadb.sqlite_master where """
-        """(type='table' AND name NOT LIKE '%!_fts!_%' escape '!') OR """
-        """type='trigger';""")
-
-    for (create_string, ) in create_strings:
-        print(create_string)
-        con.execute(create_string)
-
-    con.execute("DETACH DATABASE schemadb;")
-
-
-def find(pattern, path):
-    for root, dirs, files in os.walk(path):
-        for name in files:
-            if fnmatch.fnmatch(name, pattern):
-                yield os.path.join(root, name)
-
-
-def help():
-    print("Usage: merge_dbs DBSPATH TODB")
-    print(" DBSPATH the folder containing the *.sqlite files")
-    print(" (searched recursivly)")
-    print(" TODB the destination sqlite file")
-
-
-def main(argv):
-    if len(argv) != 2:
-        help()
-        sys.exit(1)
-    dbspath, todb = argv[:2]
-
-    print("Using sqlite3 version {}".format(sqlite3.sqlite_version))
-
-    if os.path.isdir(dbspath):
-        sqlitepaths = list(find("*.sqlite", dbspath))
-    else:
-        sqlitepaths = [dbspath]
-
-    firstdb = sqlitepaths[0]
-    tablenames = get_tablenames(firstdb)
-
-    with sqlite3.connect(todb) as con:
-        if con.execute("SELECT COUNT(*) FROM sqlite_master;").fetchone()[
-                0] == 0:
-            print("Merging schema from {}".format(firstdb))
-            merge_schema(con, firstdb)
-
-        con.execute("PRAGMA default_temp_store=2;")
-        con.execute("PRAGMA cache_size=20000;")
-        con.execute("PRAGMA temp_store=2;")
-
-        currently_attached = []
-        for i, dbpath in enumerate(sqlitepaths):
-            dbname = "db{}".format(i)
-            print("Attaching {}".format(dbpath))
-            con.execute("ATTACH DATABASE ? as ?", (dbpath, dbname))
-            currently_attached += [dbname]
-            if len(currently_attached) % MAX_ATTACHED_DBS == 0 or i + 1 == len(
-                    sqlitepaths):
-                merge_and_detach(con, currently_attached, tablenames)
-                currently_attached = []
-        con.commit()
-
-
-if __name__ == "__main__":
-    main(sys.argv[1:])
@@ -1,19 +0,0 @@
-#!/bin/bash
-#
-#$ -j yes
-#
-# Example invocation:
-# qsub merge_dbs DBDIR OUTDBPATH EXTENSIONCRAWLERPATH
-# DBDIR base dir for the generated db files
-# OUTDBPATH the generated full db
-# EXTENSIONCRAWLERPATH ExtensionCrawler git repo
-DBDIR="$1"
-OUTDBPATH="$2"
-EXTENSIONCRAWLERDIR="$3"
-
-export PATH=~/bin:$PATH
-export LD_LIBRARY_PATH=~/lib:${LD_LIBRARY_PATH:-}
-
-set -u
-
-"$EXTENSIONCRAWLERDIR/scripts/merge_dbs" "$DBDIR" "$OUTDBPATH"
@@ -1,33 +0,0 @@
-#!/bin/bash
-#
-#$ -t 1-256
-#$ -j yes
-#
-# Example invocation:
-# qsub merge_each_db DBDIR OUTDBPATH EXTENSIONCRAWLERPATH
-# DBDIR base dir for the generated db files
-# OUTDBPATH path containing the three-letter-dirs
-# EXTENSIONCRAWLERPATH ExtensionCrawler git repo
-DBDIR="$1"
-OUTDBPATH="$2"
-EXTENSIONCRAWLERDIR="$3"
-
-module -s load apps/python/conda 2> /dev/null
-source activate mypython35
-
-export PATH=~/bin:$PATH
-export LD_LIBRARY_PATH=~/lib:${LD_LIBRARY_PATH:-}
-
-function task_id_to_letter_256 {
-    ABC=abcdefghijklmnopqrstuvwxyz
-    let "I1 = (($1-1) / 16) % 16"
-    let "I2 = ($1-1) % 16"
-    echo ${ABC:$I1:1}${ABC:$I2:1}
-}
-
-set -u
-
-find "$DBDIR" -name "$(task_id_to_letter_256 $SGE_TASK_ID)*.sqlite" -print0 | while IFS= read -r -d '' file; do
-    DBNAME=$(basename "$file")
-    "$EXTENSIONCRAWLERDIR/scripts/merge_dbs" "$file" "$OUTDBPATH/${DBNAME:0:3}/$DBNAME"
-done
@@ -14,7 +14,7 @@ ssh sharc.shef.ac.uk mkdir -p $TARGETDIR/logs
 ssh sharc.shef.ac.uk mkdir -p $TARGETDIR/out

 echo "Pushing $BASEDIR to sharc.shef.ac.uk:$TARGETDIR/ExtensionCrawler ..."
-rsync -zr "$BASEDIR/" sharc.shef.ac.uk:"$TARGETDIR/ExtensionCrawler"
+rsync -zr --exclude "$BASEDIR/archive" "$BASEDIR/" sharc.shef.ac.uk:"$TARGETDIR/ExtensionCrawler"

 echo "Starting job ..."
 ssh sharc.shef.ac.uk \