Merge branch 'master' of logicalhacking.com:BrowserSecurity/ExtensionCrawler

Achim D. Brucker 2017-09-15 20:21:45 +01:00
commit 400e74ae3f
3 changed files with 43 additions and 38 deletions

View File

@@ -420,42 +420,40 @@ def update_db_incremental(tmptardir, ext_id, date):
     if etag:
         try:
             parse_and_insert_crx(ext_id, date, datepath, con)
         except zipfile.BadZipfile as e:
             log_warning(
                 "* WARNING: the found crx file is not a zip file, exception: {}".
                 format(str(e)), 3, ext_id)
         except Exception as e:
             log_exception("Exception when parsing crx", 3, ext_id)
     else:
         crx_status = get_crx_status(datepath)
         if crx_status != 401 and crx_status != 204 and crx_status != 404:
             log_warning("* WARNING: could not find etag", 3, ext_id)
-    parse_and_insert_overview(ext_id, date, datepath, con)
-    parse_and_insert_status(ext_id, date, datepath, con)
+    try:
+        parse_and_insert_overview(ext_id, date, datepath, con)
+    except Exception as e:
+        log_exception("Exception when parsing overview", 3, ext_id)
+    try:
+        parse_and_insert_status(ext_id, date, datepath, con)
+    except Exception as e:
+        log_exception("Exception when parsing status", 3, ext_id)
     reviewpaths = glob.glob(os.path.join(datepath, "reviews*-*.text"))
     for reviewpath in reviewpaths:
         try:
             parse_and_insert_review(ext_id, date, reviewpath, con)
         except json.decoder.JSONDecodeError as e:
             log_warning(
                 "* Could not parse review file, exception: {}".format(
                     str(e)), 3, ext_id)
         except Exception as e:
             log_exception("Exception when parsing review", 3, ext_id)
     supportpaths = glob.glob(os.path.join(datepath, "support*-*.text"))
     for supportpath in supportpaths:
         try:
             parse_and_insert_support(ext_id, date, supportpath, con)
         except json.decoder.JSONDecodeError as e:
             log_warning(
                 "* Could not parse support file, exception: {}".format(
                     str(e)), 3, ext_id)
         except Exception as e:
             log_exception("Exception when parsing support", 3, ext_id)
     repliespaths = glob.glob(os.path.join(datepath, "*replies.text"))
     for repliespath in repliespaths:
         try:
             parse_and_insert_replies(ext_id, date, repliespath, con)
         except json.decoder.JSONDecodeError as e:
             log_warning(
                 "* Could not parse reply file, exception: {}".format(
                     str(e)), 3, ext_id)
+        except Exception as e:
+            log_exception("Exception when parsing reply", 3, ext_id)
     con.commit()
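The net effect of this hunk is that each artifact parser (crx, overview, status, reviews, support, replies) now runs in its own try/except, so a single malformed file is logged and skipped instead of aborting the whole date before con.commit(). A minimal, self-contained sketch of that pattern, with a hypothetical parse_step helper standing in for the real parse_and_insert_* functions:

import json
import logging

def parse_step(path):
    # Hypothetical stand-in for parse_and_insert_overview/status/review/...
    with open(path) as f:
        return json.load(f)

def update_one_date(paths):
    results = {}
    for name, path in paths.items():
        # Each step gets its own handler: a failure is logged and the loop
        # moves on, so the remaining steps (and the final commit) still run.
        try:
            results[name] = parse_step(path)
        except json.JSONDecodeError as e:
            logging.warning("Could not parse %s file: %s", name, e)
        except Exception:
            logging.exception("Exception when parsing %s", name)
    return results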

View File

@@ -52,6 +52,9 @@ class MysqlBackend:
             if db is not None:
                 db.close()
                 db = None
+            db = MySQLdb.connect(**self.dbargs)
+            db.autocommit = True
+            self.cursor = db.cursor()
         except Exception as e2:
             log_error("Suppressed exception: {}".format(str(e2)), 3, self.ext_id)
         raise last_exception
@@ -72,6 +75,7 @@ class MysqlBackend:
         global db
         if db is None:
             db = MySQLdb.connect(**self.dbargs)
+            db.autocommit = True
         self.cursor = db.cursor()
         return self
@@ -84,9 +88,6 @@ class MysqlBackend:
         except Exception as e:
             log_error("Suppressed exception: {}".format(str(e)), 3, self.ext_id)

-    def commit(self):
-        db.commit()

     def get_single_value(self, query, args):
         self.retry(lambda: self.cursor.execute(query, args))
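The MysqlBackend changes enable autocommit whenever a connection is (re)established and drop the now-redundant commit() helper. A minimal sketch of the same reconnect-and-retry idea, assuming MySQLdb (mysqlclient); the connection arguments are placeholders, and the sketch uses MySQLdb's method form conn.autocommit(True):

import logging
import MySQLdb  # mysqlclient; DB_ARGS below is a placeholder, not the crawler's config

DB_ARGS = {"host": "localhost", "user": "crawler", "db": "extensions"}

def execute_with_reconnect(query, args, retries=2):
    """Run a query, reconnecting once per retry on operational errors."""
    conn = None
    last_exception = None
    for _ in range(retries):
        try:
            if conn is None:
                conn = MySQLdb.connect(**DB_ARGS)
                conn.autocommit(True)  # no explicit commit() needed afterwards
            cursor = conn.cursor()
            cursor.execute(query, args)
            return cursor.fetchall()
        except MySQLdb.OperationalError as e:
            last_exception = e
            logging.warning("Lost connection, reconnecting: %s", e)
            try:
                if conn is not None:
                    conn.close()
            except Exception as e2:
                logging.error("Suppressed exception while closing: %s", e2)
            conn = None
    raise last_exception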

View File

@@ -34,19 +34,21 @@ from ExtensionCrawler.util import log_info, log_warning, log_error, log_exception
 def help():
-    print("create-db [OPTION]")
-    print("    -h                  print this help text")
-    print("    -a <DIR>            archive directory")
-    print("    -p <PREFIX>         three-letter-prefix")
-    print("    -e <EXTIDFILELIST>  file with extension ids")
-    print("    --from-date <DATE>  only process information gathered after this date")
-    print("                        (compared lexicographically)")
-    print("    -t <THREADS>        number of parallel threads")
-    print("    -n <TASKID>         process chunk n where n in [1,N]")
-    print("    -N <MAXTASKID>      ")
+    print("""create-db [OPTION]""")
+    print("""    -h                  print this help text""")
+    print("""    -a <DIR>            archive directory""")
+    print("""    -p <PREFIX>         three-letter-prefix""")
+    print("""    -e <EXTIDFILELIST>  file with extension ids""")
+    print("""    --from-date <DATE>  only process information gathered after"""
+          """ this date (compared lexicographically)""")
+    print("""    --until-date <DATE> only process information gathered before"""
+          """ this date (compared lexicographically)""")
+    print("""    -t <THREADS>        number of parallel threads""")
+    print("""    -n <TASKID>         process chunk n where n in [1,N]""")
+    print("""    -N <MAXTASKID>      """)

-def process_id(from_date, path):
+def process_id(from_date, until_date, path):
     start = time.time()
     with tempfile.TemporaryDirectory() as tmpdir:
         with tarfile.open(path) as t:
@@ -57,7 +59,8 @@ def process_id(from_date, path):
         iddir = os.path.join(tmpdir, extid)
         for date in sorted(os.listdir(iddir)):
-            if from_date is not None and date < from_date:
+            if (from_date is not None and date < from_date) or \
+               (until_date is not None and date > until_date):
                 log_info("* Skipping {}".format(date), 2, extid)
                 continue
             try:
@@ -97,13 +100,14 @@ def parse_args(argv):
     taskid = 1
     maxtaskid = 1
     from_date = None
+    until_date = None
     paths = []
     try:
         opts, args = getopt.getopt(argv, "ha:p:e:t:n:N:", [
             "archive=", "prefix=", "extidlistfile=", "threads=", "taskid=",
-            "maxtaskid=", "from-date=", "help"
+            "maxtaskid=", "from-date=", "until-date=", "help"
         ])
     except getopt.GetoptError:
         help()
@@ -126,6 +130,8 @@ def parse_args(argv):
             maxtaskid = int(arg)
         elif opt in ("--from-date"):
             from_date = arg
+        elif opt in ("--until-date"):
+            until_date = arg

     if paths == []:
         paths = list(find(archive, "*"))
@@ -136,16 +142,16 @@
     else:
         paths = paths[(taskid - 1) * chunksize:taskid * chunksize]

-    return paths, parallel, from_date
+    return paths, parallel, from_date, until_date


 def main(argv):
     logging.basicConfig(level=logging.INFO, format=const_log_format())
-    paths, parallel, from_date = parse_args(argv)
+    paths, parallel, from_date, until_date = parse_args(argv)
     with Pool(processes=parallel) as p:
-        p.map(partial(process_id, from_date), paths)
+        p.map(partial(process_id, from_date, until_date), paths)


 if __name__ == "__main__":
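The create-db changes add an --until-date bound that, together with --from-date, defines a lexicographic date window, and they thread the extra value through parse_args, partial, and process_id. A small, runnable sketch of that plumbing; the worker body, paths, and dates are placeholders:

from functools import partial
from multiprocessing import Pool

def in_window(date, from_date, until_date):
    # Lexicographic comparison matches chronological order for
    # zero-padded ISO-style strings such as "2017-09-15".
    if from_date is not None and date < from_date:
        return False
    if until_date is not None and date > until_date:
        return False
    return True

def process_id(from_date, until_date, path):
    # Placeholder worker: the real one untars `path` and iterates its date dirs.
    dates = ["2017-06-01", "2017-09-15", "2017-12-24"]
    return [d for d in dates if in_window(d, from_date, until_date)]

if __name__ == "__main__":
    paths = ["ext-a.tar", "ext-b.tar"]  # placeholder archive paths
    with Pool(processes=2) as p:
        # partial fixes the two date bounds so Pool.map only supplies the path,
        # mirroring main() above.
        print(p.map(partial(process_id, "2017-07-01", "2017-12-31"), paths))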