Merge branch 'master' of logicalhacking.com:BrowserSecurity/ExtensionCrawler
commit 400e74ae3f
@@ -420,42 +420,40 @@ def update_db_incremental(tmptardir, ext_id, date):
     if etag:
         try:
             parse_and_insert_crx(ext_id, date, datepath, con)
         except zipfile.BadZipfile as e:
             log_warning(
                 "* WARNING: the found crx file is not a zip file, exception: {}".
                 format(str(e)), 3, ext_id)
         except Exception as e:
             log_exception("Exception when parsing crx", 3, ext_id)
     else:
         crx_status = get_crx_status(datepath)
         if crx_status != 401 and crx_status != 204 and crx_status != 404:
             log_warning("* WARNING: could not find etag", 3, ext_id)
 
-    parse_and_insert_overview(ext_id, date, datepath, con)
-    parse_and_insert_status(ext_id, date, datepath, con)
+    try:
+        parse_and_insert_overview(ext_id, date, datepath, con)
+    except Exception as e:
+        log_exception("Exception when parsing overview", 3, ext_id)
+
+    try:
+        parse_and_insert_status(ext_id, date, datepath, con)
+    except Exception as e:
+        log_exception("Exception when parsing status", 3, ext_id)
 
     reviewpaths = glob.glob(os.path.join(datepath, "reviews*-*.text"))
     for reviewpath in reviewpaths:
         try:
             parse_and_insert_review(ext_id, date, reviewpath, con)
         except json.decoder.JSONDecodeError as e:
             log_warning(
                 "* Could not parse review file, exception: {}".format(
                     str(e)), 3, ext_id)
         except Exception as e:
             log_exception("Exception when parsing review", 3, ext_id)
 
     supportpaths = glob.glob(os.path.join(datepath, "support*-*.text"))
     for supportpath in supportpaths:
         try:
             parse_and_insert_support(ext_id, date, supportpath, con)
         except json.decoder.JSONDecodeError as e:
             log_warning(
                 "* Could not parse support file, exception: {}".format(
                     str(e)), 3, ext_id)
         except Exception as e:
             log_exception("Exception when parsing support", 3, ext_id)
 
     repliespaths = glob.glob(os.path.join(datepath, "*replies.text"))
     for repliespath in repliespaths:
         try:
             parse_and_insert_replies(ext_id, date, repliespath, con)
         except json.decoder.JSONDecodeError as e:
             log_warning(
                 "* Could not parse reply file, exception: {}".format(
                     str(e)), 3, ext_id)
-    con.commit()
+        except Exception as e:
+            log_exception("Exception when parsing reply", 3, ext_id)
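The hunk above wraps each parser call in its own try/except, so a single malformed overview, status, review, support, or reply file is logged and skipped instead of aborting the whole date directory. A minimal, self-contained sketch of that fault-isolation pattern, with simplified stand-ins for the ExtensionCrawler parser and logging helpers:

    import json
    import logging

    logging.basicConfig(level=logging.INFO)
    log = logging.getLogger("crawler")

    def parse_review_file(path):
        # Stand-in for parse_and_insert_review: JSON parsing may fail.
        with open(path) as f:
            return json.load(f)

    def process_date_dir(paths):
        parsed = []
        for path in paths:
            try:
                parsed.append(parse_review_file(path))
            except json.decoder.JSONDecodeError as e:
                # Expected failure mode: warn and keep going.
                log.warning("Could not parse %s: %s", path, e)
            except Exception:
                # Unexpected failure: log the traceback, still keep going.
                log.exception("Exception when parsing %s", path)
        return parsed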
@@ -52,6 +52,9 @@ class MysqlBackend:
                 if db is not None:
                     db.close()
                     db = None
                 db = MySQLdb.connect(**self.dbargs)
                 db.autocommit = True
                 self.cursor = db.cursor()
             except Exception as e2:
                 log_error("Surpressed exception: {}".format(str(e2)), 3, self.ext_id)
         raise last_exception
@@ -72,6 +75,7 @@ class MysqlBackend:
         global db
         if db is None:
             db = MySQLdb.connect(**self.dbargs)
+            db.autocommit = True
         self.cursor = db.cursor()
 
         return self
@@ -84,9 +88,6 @@ class MysqlBackend:
         except Exception as e:
             log_error("Surpressed exception: {}".format(str(e)), 3, self.ext_id)
 
-    def commit(self):
-        db.commit()
-
     def get_single_value(self, query, args):
         self.retry(lambda: self.cursor.execute(query, args))
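Taken together, the MysqlBackend hunks enable autocommit on every (re)connect and drop the explicit commit() helper, so a reconnect inside the retry path cannot strand an open transaction. A rough, self-contained sketch of that retry-and-reconnect shape, using sqlite3 from the standard library purely for illustration (the real backend uses MySQLdb; the class and variable names below are made up):

    import logging
    import sqlite3

    log = logging.getLogger("backend")

    class SqliteBackend:
        """Toy stand-in for MysqlBackend: autocommit plus reconnect on retry."""

        def __init__(self, path):
            self.path = path
            self.db = None

        def connect(self):
            # isolation_level=None puts sqlite3 into autocommit mode,
            # mirroring the autocommit flag set right after connecting.
            self.db = sqlite3.connect(self.path, isolation_level=None)

        def retry(self, f, maxtries=3):
            last_exception = None
            for _ in range(maxtries):
                try:
                    return f()
                except sqlite3.OperationalError as e:
                    last_exception = e
                    try:
                        # Drop the old handle and reconnect before retrying.
                        if self.db is not None:
                            self.db.close()
                            self.db = None
                        self.connect()
                    except Exception as e2:
                        log.error("Suppressed exception: %s", e2)
            raise last_exception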
create-db
@@ -34,19 +34,21 @@ from ExtensionCrawler.util import log_info, log_warning, log_error, log_exception
 
 
 def help():
-    print("create-db [OPTION]")
-    print("    -h                  print this help text")
-    print("    -a <DIR>            archive directory")
-    print("    -p <PREFIX>         three-letter-prefix")
-    print("    -e <EXTIDFILELIST>  file with extension ids")
-    print("    --from-date <DATE>  only process information gathered after this date")
-    print("                        (compared lexographically)")
-    print("    -t <THREADS>        number of parallel threads")
-    print("    -n <TASKID>         process chunk n where n in [1,N]")
-    print("    -N <MAXTASKID>      ")
+    print("""create-db [OPTION]""")
+    print("""    -h                  print this help text""")
+    print("""    -a <DIR>            archive directory""")
+    print("""    -p <PREFIX>         three-letter-prefix""")
+    print("""    -e <EXTIDFILELIST>  file with extension ids""")
+    print("""    --from-date <DATE>  only process information gathered after"""
+          """ this date (compared lexographically)""")
+    print("""    --until-date <DATE> only process information gathered before"""
+          """ this date (compared lexographically)""")
+    print("""    -t <THREADS>        number of parallel threads""")
+    print("""    -n <TASKID>         process chunk n where n in [1,N]""")
+    print("""    -N <MAXTASKID>      """)
 
 
-def process_id(from_date, path):
+def process_id(from_date, until_date, path):
     start = time.time()
     with tempfile.TemporaryDirectory() as tmpdir:
         with tarfile.open(path) as t:
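With the new option, --from-date and --until-date together bound the processed snapshot dates on both sides. A hypothetical invocation (archive path, prefix, thread count, and dates are placeholders, not taken from the commit):

    create-db -a /srv/archive -p aaa -t 4 --from-date 2017-06-01 --until-date 2017-06-30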
@@ -57,7 +59,8 @@ def process_id(from_date, path):
         iddir = os.path.join(tmpdir, extid)
 
         for date in sorted(os.listdir(iddir)):
-            if from_date is not None and date < from_date:
+            if (from_date is not None and date < from_date) or \
+                    (until_date is not None and date > until_date):
                 log_info("* Skipping {}".format(date), 2, extid)
                 continue
             try:
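The bounds check relies on the snapshot date names sorting chronologically as plain strings (the "compared lexographically" note in the help text), which holds for ISO-8601-style names. A small stand-alone illustration of the same filter, assuming that date format:

    def in_range(date, from_date=None, until_date=None):
        # Lexicographic comparison orders ISO-8601-style date strings chronologically.
        if from_date is not None and date < from_date:
            return False
        if until_date is not None and date > until_date:
            return False
        return True

    dates = ["2017-06-28", "2017-06-29", "2017-06-30", "2017-07-01"]
    assert [d for d in dates if in_range(d, "2017-06-29", "2017-06-30")] == \
        ["2017-06-29", "2017-06-30"]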
@@ -97,13 +100,14 @@ def parse_args(argv):
     taskid = 1
     maxtaskid = 1
     from_date = None
+    until_date = None
 
     paths = []
 
     try:
         opts, args = getopt.getopt(argv, "ha:p:e:t:n:N:", [
             "archive=", "prefix=", "extidlistfile=", "threads=", "taskid=",
-            "maxtaskid=", "from-date=", "help"
+            "maxtaskid=", "from-date=", "until-date=", "help"
         ])
     except getopt.GetoptError:
         help()
@@ -126,6 +130,8 @@ def parse_args(argv):
             maxtaskid = int(arg)
         elif opt in ("--from-date"):
             from_date = arg
+        elif opt in ("--until-date"):
+            until_date = arg
 
     if paths == []:
         paths = list(find(archive, "*"))
@@ -136,16 +142,16 @@ def parse_args(argv):
     else:
         paths = paths[(taskid - 1) * chunksize:taskid * chunksize]
 
-    return paths, parallel, from_date
+    return paths, parallel, from_date, until_date
 
 
 def main(argv):
     logging.basicConfig(level=logging.INFO, format=const_log_format())
 
-    paths, parallel, from_date = parse_args(argv)
+    paths, parallel, from_date, until_date = parse_args(argv)
 
     with Pool(processes=parallel) as p:
-        p.map(partial(process_id, from_date), paths)
+        p.map(partial(process_id, from_date, until_date), paths)
 
 
 if __name__ == "__main__":
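main() pre-binds both date bounds with functools.partial, so Pool.map still hands each worker exactly one argument (the archive path). A minimal runnable sketch of that call shape; the worker body is a placeholder, not the real process_id:

    from functools import partial
    from multiprocessing import Pool

    def process_id(from_date, until_date, path):
        # Placeholder worker: the real one unpacks and parses the tar archive.
        print("processing {} (from={}, until={})".format(path, from_date, until_date))

    if __name__ == "__main__":
        paths = ["aaa.tar", "aab.tar"]
        with Pool(processes=2) as p:
            # partial() fixes from_date/until_date; map() supplies each path.
            p.map(partial(process_id, "2017-06-01", "2017-06-30"), paths)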