Fixed style errors and warnings.
parent ac3c1c7f20
commit a789fe505f
@@ -23,7 +23,6 @@ import os
 import glob
 import re
 import json
-from multiprocessing import Pool
 from concurrent.futures import TimeoutError
 from pebble import ProcessPool, ProcessExpired
 from functools import partial
@@ -44,6 +43,7 @@ from ExtensionCrawler.config import (
 from ExtensionCrawler.util import google_dos_protection, value_of, log_info, log_warning, log_exception, setup_logger
 from ExtensionCrawler.db import update_db_incremental
 
+
 class Error(Exception):
     pass
 
@@ -82,11 +82,11 @@ class RequestResult:
 
 
 class UpdateResult:
-    def __init__(self, id, is_new, exception, res_overview, res_crx,
+    def __init__(self, ext_id, is_new, exception, res_overview, res_crx,
                  res_reviews, res_support, res_sql, sql_update, worker_exception=None):
-        self.id = id
+        self.ext_id = ext_id
         self.new = is_new
-        self.exception = exception # TODO: should be tar_exception
+        self.exception = exception  # TODO: should be tar_exception
         self.res_overview = res_overview
         self.res_crx = res_crx
         self.res_reviews = res_reviews
@@ -188,7 +188,7 @@ def last_modified_http_date(path):
 
 
 def last_crx(archivedir, extid, date=None):
-    last_crx = ""
+    last_crx_path = ""
     last_crx_etag = ""
 
     etag_file = os.path.join(archivedir, get_local_archive_dir(extid),
@@ -198,14 +198,13 @@ def last_crx(archivedir, extid, date=None):
             with open(etag_file, 'r') as f:
                 d = json.load(f)
                 return d["last_crx"], d["last_crx_etag"]
-        except Exception as e:
+        except Exception:
             log_exception("Something was wrong with the etag file {}, deleting it ...".format(etag_file))
             try:
                 os.remove(etag_file)
-            except Exception as e:
+            except Exception:
                 log_exception("Could not remove etag file {}!".format(etag_file))
 
-
     # If we do not yet have an .etag file present, open the tarfile and look
    # there for one. After having done that once, the crawler creates the .etag
     # file to avoid opening the tar file in the future.
@@ -219,23 +218,23 @@ def last_crx(archivedir, extid, date=None):
                 date is None or (dateutil.parser.parse(
                     os.path.split(os.path.split(x.name)[0])[1]) <= date))
             ])
-            if old_crxs != []:
-                last_crx = old_crxs[-1]
+            if old_crxs:
+                last_crx_path = old_crxs[-1]
                 headers_content = t.extractfile(
-                    last_crx + ".headers").read().decode().replace(
+                    last_crx_path + ".headers").read().decode().replace(
                         '"', '\\"').replace("'", '"')
                 headers_json = json.loads(headers_content)
                 last_crx_etag = headers_json["ETag"]
 
                 if date is None:
                     with open(etag_file, 'w') as f:
-                        json.dump({"last_crx": last_crx, "last_crx_etag": last_crx_etag}, f)
+                        json.dump({"last_crx": last_crx_path, "last_crx_etag": last_crx_etag}, f)
 
-    return last_crx, last_crx_etag
+    return last_crx_path, last_crx_etag
 
 
 def first_crx(archivedir, extid, date=None):
-    first_crx = ""
+    first_crx_path = ""
     tar = os.path.join(archivedir, get_local_archive_dir(extid),
                        extid + ".tar")
     if os.path.exists(tar):
@@ -247,10 +246,10 @@ def first_crx(archivedir, extid, date=None):
                     os.path.split(os.path.split(x.name)[0])[1])))
             ])
             t.close()
-            if old_crxs != []:
-                first_crx = old_crxs[0]
+            if old_crxs:
+                first_crx_path = old_crxs[0]
 
-    return first_crx
+    return first_crx_path
 
 
 def all_crx(archivedir, extid, date=None):
@@ -283,7 +282,7 @@ def update_overview(tar, date, ext_id):
 
 def validate_crx_response(res, extid, extfilename):
     regex_extfilename = re.compile(r'^extension[_0-9]+\.crx$')
-    if not 'Content-Type' in res.headers:
+    if 'Content-Type' not in res.headers:
         raise CrawlError(extid, 'Did not find Content-Type header.', '\n'.join(
             res.iter_lines()))
     if not res.headers['Content-Type'] == 'application/x-chrome-extension':
@@ -351,10 +350,12 @@ def update_crx(archivedir, tmptardir, ext_id, date):
                     f.write(chunk)
             write_text(tmptardir, date, extfilename + ".etag",
                        res.headers.get("ETag"))
-            etag_file = os.path.join(archivedir, get_local_archive_dir(ext_id),
-                                     ext_id + ".etag")
+            etag_file = os.path.join(archivedir, get_local_archive_dir(ext_id), ext_id + ".etag")
             with open(etag_file, 'w') as f:
-                json.dump({"last_crx": os.path.join(ext_id, date, extfilename), "last_crx_etag": res.headers.get("ETag")}, f)
+                json.dump({
+                    "last_crx": os.path.join(ext_id, date, extfilename),
+                    "last_crx_etag": res.headers.get("ETag")
+                }, f)
     except Exception as e:
         log_exception("Exception when updating crx", 3, ext_id)
         write_text(tmptardir, date, extfilename + ".exception",
@@ -367,9 +368,10 @@ def iterate_authors(pages):
     for page in pages:
         json_page = json.loads(page[page.index("{\""):page.rindex("}}},") + 1])
         for annotation in json_page["annotations"]:
-            if "attributes" in annotation and "replyExists" in annotation["attributes"] and annotation["attributes"]["replyExists"]:
-                yield (annotation["entity"]["author"],
-                       annotation["entity"]["groups"])
+            if "attributes" in annotation:
+                if "replyExists" in annotation["attributes"]:
+                    if annotation["attributes"]["replyExists"]:
+                        yield (annotation["entity"]["author"], annotation["entity"]["groups"])
 
 
 def update_reviews(tar, date, ext_id):
@@ -550,7 +552,7 @@ def update_extension(archivedir, forums, ext_id):
         try:
             write_text(tardir, date, ext_id + ".sql.exception",
                        traceback.format_exc())
-        except Exception as e:
+        except Exception:
             pass
     try:
         shutil.rmtree(path=tmpdir)
@@ -581,13 +583,11 @@ def init_process(verbose, start_pystuck=False):
         pystuck.run_server(port=((os.getpid() % 10000) + 10001))
 
 
-def execute_parallel(archivedir, max_retry, timeout, max_workers, ext_ids, forums, verbose, start_pystuck):
-    results=[]
-    with ProcessPool(max_workers=max_workers, max_tasks=100, initializer=init_process, initargs=(verbose, start_pystuck)) as pool:
-        future = pool.map(partial(update_extension, archivedir, forums),
-                          ext_ids,
-                          chunksize=1,
-                          timeout=timeout)
+def execute_parallel(archivedir, timeout, max_workers, ext_ids, forums, verbose, start_pystuck):
+    results = []
+    with ProcessPool(max_workers=max_workers, max_tasks=100, initializer=init_process,
+                     initargs=(verbose, start_pystuck)) as pool:
+        future = pool.map(partial(update_extension, archivedir, forums), ext_ids, chunksize=1, timeout=timeout)
         iterator = future.result()
         for ext_id in ext_ids:
             try:
@@ -595,7 +595,7 @@ def execute_parallel(archivedir, max_retry, timeout, max_workers, ext_ids, forums, verbose, start_pystuck):
             except StopIteration:
                 break
             except TimeoutError as error:
-                log_warning("WorkerException: Processing of %s took longer than %d seconds" % (ext_id,error.args[1]))
+                log_warning("WorkerException: Processing of %s took longer than %d seconds" % (ext_id, error.args[1]))
                 results.append(UpdateResult(ext_id, False, None, None, None, None, None, None, None, error))
             except ProcessExpired as error:
                 log_warning("WorkerException: %s (%s), exit code: %d" % (error, ext_id, error.exitcode))
@@ -609,8 +609,6 @@ def execute_parallel(archivedir, max_retry, timeout, max_workers, ext_ids, forums, verbose, start_pystuck):
 
 
 def update_extensions(archivedir, parallel, forums_ext_ids, ext_ids, timeout, verbose, start_pystuck):
-    ext_with_forums = []
-    ext_without_forums = []
     forums_ext_ids = (list(set(forums_ext_ids)))
 
     log_info("Updating {} extensions ({} including forums)".format(
@@ -621,13 +619,13 @@ def update_extensions(archivedir, parallel, forums_ext_ids, ext_ids, timeout, verbose, start_pystuck):
         parallel_ids = ext_ids
     log_info("Updating {} extensions excluding forums (parallel)".format(
         len(parallel_ids)), 1)
-    ext_without_forums = execute_parallel(archivedir, 3, timeout, parallel, parallel_ids, False, verbose, start_pystuck)
+    ext_without_forums = execute_parallel(archivedir, timeout, parallel, parallel_ids, False, verbose, start_pystuck)
 
     # Second, update extensions with forums sequentially (and with delays) to
     # avoid running into Googles DDOS detection.
     log_info("Updating {} extensions including forums (sequentially)".format(
         len(forums_ext_ids)), 1)
-    ext_with_forums = execute_parallel(archivedir, 3, timeout, 1, forums_ext_ids, True, verbose, start_pystuck)
+    ext_with_forums = execute_parallel(archivedir, timeout, 1, forums_ext_ids, True, verbose, start_pystuck)
 
     return ext_with_forums + ext_without_forums
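The rewritten execute_parallel keeps pebble's ProcessPool pattern (map with a per-item timeout, then iterate future.result() and catch TimeoutError / ProcessExpired per item) while dropping the unused max_retry parameter. A minimal, runnable sketch of that pattern follows; the work() and run_all() names are illustrative and not part of this repository:

# Sketch only: the pebble timeout-handling pattern used by execute_parallel above.
from concurrent.futures import TimeoutError

from pebble import ProcessPool, ProcessExpired


def work(item):
    # placeholder for the real per-item job (update_extension in the crawler)
    return item * 2


def run_all(items, timeout=10):
    results = []
    with ProcessPool(max_workers=4) as pool:
        future = pool.map(work, items, chunksize=1, timeout=timeout)
        iterator = future.result()
        for item in items:
            try:
                results.append(next(iterator))
            except StopIteration:
                break
            except TimeoutError as error:
                # error.args[1] holds the timeout that was exceeded
                print("%s took longer than %d seconds" % (item, error.args[1]))
            except ProcessExpired as error:
                print("%s died, exit code: %s" % (item, error.exitcode))
    return results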
@@ -169,10 +169,10 @@ def update_lib(force, archive, lib):
     outphased = []
     for lib_ver in local_lib_json['assets']:
         version = lib_ver['version']
-        if not version in cdnjs_versions:
+        if version not in cdnjs_versions:
             logging.warning("Found outphased versions for " + name + " " +
                             str(version) + " , preserving from archive.")
-            if not 'outphased' in lib_ver:
+            if 'outphased' not in lib_ver:
                 lib_ver[
                     'outphased'] = datetime.datetime.utcnow().isoformat()
             outphased.append(lib_ver)
@@ -260,7 +260,7 @@ def delete_orphaned(archive, local_libs, cdnjs_current_libs):
     """Delete all orphaned local libaries."""
     dirname = os.path.join(archive, "filedb", "cdnjs", "lib")
     for lib in local_libs:
-        if not lib in cdnjs_current_libs:
+        if lib not in cdnjs_current_libs:
             os.remove(os.path.join(dirname, lib + ".json"))
 

@@ -25,8 +25,7 @@ import logging
 import os
 import re
 import sys
-from functools import partial, reduce
-from multiprocessing import Pool
+from functools import reduce
 
 import dateutil.parser
 import git
@@ -70,8 +69,8 @@ def pull_list_changed_files(git_path):
         for diff in single_fetch_info.commit.diff(
                 single_fetch_info.old_commit):
             logging.debug("Found diff: " + str(diff))
-            if not diff.a_blob is None:
-                if not diff.a_blob.path in files:
+            if diff.a_blob is not None:
+                if diff.a_blob.path not in files:
                     files.append(diff.a_blob.path)
     return files
 
@@ -98,7 +97,7 @@ def hackish_pull_list_changed_files(git_path):
 
     for line in pull_lines:
         match = re.search(r'^ (.+) \| .*$', line)
-        if not match is None:
+        if match is not None:
             changed_files = match.group(1).split('=>')
             for changed_file in changed_files:
                 files.add(changed_file.strip())
@@ -139,6 +138,7 @@ def get_file_libinfo(release_dic, git_path, libfile):
         file_info['library'] = lib
         file_info['version'] = version
         file_info['add_date'] = release_dic[(lib, version)]
+        # TODO: why is package not used?
         package = os.path.join(
             reduce(os.path.join, plist[:idx + 1]), "package.json")
         return file_info
@@ -167,7 +167,7 @@ def get_all_lib_files(cdnjs_git_path, localpath=None):
     libvers = set()
     files = []
     versionidx = len(path_to_list(cdnjs_git_path)) + 4
-    if not localpath is None:
+    if localpath is not None:
         paths = os.path.join(cdnjs_git_path, localpath)
     else:
         paths = os.path.join(cdnjs_git_path, 'ajax/libs/**/*')
@@ -196,7 +196,7 @@ def update_database_for_file(create_csv, release_dic, cdnjs_git_path, filename,
     if os.path.isfile(filename):
         logging.info("Updating database for file " + filename)
         file_info = get_file_libinfo(release_dic, cdnjs_git_path, filename)
-        if not file_info is None:
+        if file_info is not None:
             if create_csv:
                 print(file_info['path'])
                 print(cdnjs_git_path)
@@ -268,7 +268,7 @@ def update_database_for_file_chunked(create_csv, release_dic, cdnjs_git_path,
     retries = 0
     success = False
     max_retries = 4
-    while (not success and (retries < max_retries)):
+    while not success and (retries < max_retries):
         try:
             update_database_for_file_chunked_timeout(create_csv, release_dic,
                                                      cdnjs_git_path, filenames)
@@ -305,7 +305,7 @@ def get_release_triple(git_path, libver):
     lib = plist[-2]
     date = get_add_date(git_path, libver)
     logging.info("Release information:" + lib + " " + ver + ": " + str(date))
-    return (lib, ver, date)
+    return lib, ver, date
 
 
 def build_release_date_dic(git_path, libvers):
@@ -332,7 +332,6 @@ def pull_and_update_db(cdnjs_git_path, create_csv):
 
 def update_db_from_listfile(cdnjs_git_path, listfile, create_csv):
     """Update database (without pull) for files in listfile)"""
-    paths = []
     with open(listfile) as listfileobj:
         paths = listfileobj.read().splitlines()
     files = []
@@ -142,10 +142,12 @@ def const_verbose():
     """Default verbosity."""
     return True
 
+
 def const_use_process_pool():
     """Use ProcessPool (from module 'pebble') for concurrency."""
     return False
 
+
 def const_log_format():
     return '%(process)6s %(asctime)s %(levelname)8s %(message)s'
 
@@ -154,14 +156,17 @@ def const_discover():
     """Default configuration of discovery mode"""
     return False
 
+
 def const_download_ext_ids_with_forums():
     """Download extensions with forums (sequential mode)"""
     return True
 
+
 def const_download_ext_ids_without_forums():
     """Download extensions without forums (parallel mode)"""
     return True
 
+
 def const_ext_timeout():
     """Timeout for downloading an individual extension (2 hours)."""
     return 2*60*60
@@ -15,13 +15,12 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 
-from ExtensionCrawler.config import *
-from ExtensionCrawler.util import *
-from ExtensionCrawler.crx import *
-from ExtensionCrawler.archive import *
-from ExtensionCrawler.js_decomposer import decompose_js_with_connection, DetectionType, FileClassification
+from ExtensionCrawler.config import const_mysql_config_file
+from ExtensionCrawler.crx import read_crx
+from ExtensionCrawler.js_decomposer import decompose_js_with_connection
+from ExtensionCrawler.util import log_warning, log_debug, log_exception, log_info
 
-from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend
+from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend, convert_date
 
 import re
 from bs4 import BeautifulSoup
@@ -63,7 +62,7 @@ def get_etag(ext_id, datepath, con):
             link = f.read()
             linked_date = link[3:].split("/")[0]
 
-            result = con.get_etag(ext_id, con.convert_date(linked_date))
+            result = con.get_etag(ext_id, convert_date(linked_date))
             if result is not None:
                 return result
 
@@ -166,7 +165,7 @@ def parse_and_insert_overview(ext_id, date, datepath, con):
     con.insert(
         "extension",
         extid=ext_id,
-        date=con.convert_date(date),
+        date=convert_date(date),
         name=name,
         version=version,
         description=description,
@@ -184,12 +183,12 @@ def parse_and_insert_overview(ext_id, date, datepath, con):
         con.insert(
             "category",
             extid=ext_id,
-            date=con.convert_date(date),
+            date=convert_date(date),
             category_md5=hashlib.md5(category.encode()).digest(),
             category=category)
 
 
-def parse_and_insert_crx(ext_id, date, datepath, con):
+def parse_and_insert_crx(ext_id, datepath, con):
     crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
     if not crx_path:
         return
@@ -314,7 +313,7 @@ def parse_and_insert_review(ext_id, date, reviewpath, con):
         con.insert(
             "review",
             extid=ext_id,
-            date=con.convert_date(date),
+            date=convert_date(date),
            commentdate=datetime.datetime.utcfromtimestamp(
                 get(review, "timestamp")).isoformat()
             if "timestamp" in review else None,
@@ -345,7 +344,7 @@ def parse_and_insert_support(ext_id, date, supportpath, con):
         con.insert(
             "support",
             extid=ext_id,
-            date=con.convert_date(date),
+            date=convert_date(date),
             commentdate=datetime.datetime.utcfromtimestamp(
                 get(review, "timestamp")).isoformat()
             if "timestamp" in review else None,
@@ -365,7 +364,7 @@ def parse_and_insert_replies(ext_id, date, repliespath, con):
     log_debug("- parsing reply file", 3, ext_id)
     with open(repliespath) as f:
         d = json.load(f)
-        if not "searchResults" in d:
+        if "searchResults" not in d:
             log_warning("* WARNING: there are no search results in {}".format(
                 repliespath), 3, ext_id)
             return
@@ -379,7 +378,7 @@ def parse_and_insert_replies(ext_id, date, repliespath, con):
             con.insert(
                 "reply",
                 extid=ext_id,
-                date=con.convert_date(date),
+                date=convert_date(date),
                 commentdate=datetime.datetime.utcfromtimestamp(
                     get(annotation, "timestamp")).isoformat()
                 if "timestamp" in annotation else None,
@@ -413,7 +412,7 @@ def parse_and_insert_status(ext_id, date, datepath, con):
     con.insert(
         "status",
         extid=ext_id,
-        date=con.convert_date(date),
+        date=convert_date(date),
         crx_status=crx_status,
         overview_status=overview_status,
         overview_exception=overview_exception)
@@ -439,8 +438,8 @@ def update_db_incremental_with_connection(tmptardir, ext_id, date, con):
 
     if etag:
         try:
-            parse_and_insert_crx(ext_id, date, datepath, con)
-        except Exception as e:
+            parse_and_insert_crx(ext_id, datepath, con)
+        except Exception:
             log_exception("Exception when parsing crx", 3, ext_id)
     else:
         crx_status = get_crx_status(datepath)
@@ -449,40 +448,40 @@ def update_db_incremental_with_connection(tmptardir, ext_id, date, con):
 
     try:
         parse_and_insert_overview(ext_id, date, datepath, con)
-    except Exception as e:
+    except Exception:
         log_exception("Exception when parsing overview", 3, ext_id)
 
     try:
         parse_and_insert_status(ext_id, date, datepath, con)
-    except Exception as e:
+    except Exception:
         log_exception("Exception when parsing status", 3, ext_id)
 
     reviewpaths = glob.glob(os.path.join(datepath, "reviews*-*.text"))
     for reviewpath in reviewpaths:
         try:
             parse_and_insert_review(ext_id, date, reviewpath, con)
-        except json.decoder.JSONDecodeError as e:
+        except json.decoder.JSONDecodeError:
             log_warning("- WARNING: Review is not a proper json file!", 3,
                         ext_id)
-        except Exception as e:
+        except Exception:
             log_exception("Exception when parsing review", 3, ext_id)
 
     supportpaths = glob.glob(os.path.join(datepath, "support*-*.text"))
     for supportpath in supportpaths:
         try:
             parse_and_insert_support(ext_id, date, supportpath, con)
-        except json.decoder.JSONDecodeError as e:
+        except json.decoder.JSONDecodeError:
             log_warning("- WARNING: Support is not a proper json file!", 3,
                         ext_id)
-        except Exception as e:
+        except Exception:
             log_exception("Exception when parsing support", 3, ext_id)
 
     repliespaths = glob.glob(os.path.join(datepath, "*replies.text"))
     for repliespath in repliespaths:
         try:
             parse_and_insert_replies(ext_id, date, repliespath, con)
-        except json.decoder.JSONDecodeError as e:
+        except json.decoder.JSONDecodeError:
             log_warning("- WARNING: Reply is not a proper json file!", 3,
                         ext_id)
-        except Exception as e:
+        except Exception:
             log_exception("Exception when parsing reply", 3, ext_id)
@@ -18,18 +18,17 @@
 import time
 import datetime
 from random import uniform
 from itertools import starmap
 import logging
 
 import MySQLdb
 import _mysql_exceptions
 
 import ExtensionCrawler.config as config
-from ExtensionCrawler.util import log_info, log_error, log_exception, log_warning
+from ExtensionCrawler.util import log_info, log_error, log_warning
 
 
 class MysqlBackend:
-    def __init__(self, ext_id, try_wait=config.const_mysql_try_wait(), maxtries=config.const_mysql_maxtries(), **kwargs):
+    def __init__(self, ext_id, try_wait=config.const_mysql_try_wait(), maxtries=config.const_mysql_maxtries(),
+                 **kwargs):
         self.ext_id = ext_id
         self.dbargs = kwargs
         self.try_wait = try_wait
@@ -147,5 +146,6 @@ class MysqlBackend:
         result = self.retry(lambda: self.cursor.fetchone())
         return result
 
-    def convert_date(self, date):
-        return date[:-6]
+
+def convert_date(date):
+    return date[:-6]
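convert_date is now a plain module-level helper instead of a MysqlBackend method; it simply cuts the trailing six-character UTC offset off an ISO 8601 timestamp string, presumably so the value is accepted as a MySQL DATETIME. A tiny illustration (the example input is chosen here, not taken from the commit):

from ExtensionCrawler.dbbackend.mysql_backend import convert_date

# "+00:00" is six characters, which date[:-6] removes
assert convert_date("2017-08-02T12:34:56+00:00") == "2017-08-02T12:34:56"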
@@ -17,12 +17,10 @@
 """Python mnodule providing methods for discovering extensions in the
    Chrome extension store."""
 
-import xml.etree.ElementTree as ET
+from xml.etree.ElementTree import fromstring
-import re
-from functools import reduce
 import requests
 from ExtensionCrawler import config
 from ExtensionCrawler.util import log_info, log_exception
 
 
 def crawl_nearly_all_of_ext_ids():
@@ -30,7 +28,7 @@ def crawl_nearly_all_of_ext_ids():
 
     def get_inner_elems(doc):
         """Get inner element."""
-        return ET.fromstring(doc).iterfind(r".//{{{}}}loc".format(
+        return fromstring(doc).iterfind(r".//{{{}}}loc".format(
             config.const_sitemap_scheme()))
 
     def is_generic_url(url):
@@ -30,12 +30,14 @@ import magic
 
 from ExtensionCrawler.js_mincer import mince_js
 
+
 def is_binary_resource(mimetype_magic):
     return (mimetype_magic.startswith("image/") or
             mimetype_magic.startswith("video/") or
             mimetype_magic.startswith("audio/") or
             mimetype_magic == "application/pdf")
 
+
 def normalize_jsdata(str_data):
     """Compute normalized code blocks of a JavaScript file"""
     txt = ""
@@ -59,9 +61,8 @@ def get_features(s):
 
 def get_simhash(encoding, data):
     """Compute simhash of text."""
-    str_data = ""
-    if not encoding is None:
-        str_data = data.decode(encoding=encoding,errors="replace")
+    if encoding is not None:
+        str_data = data.decode(encoding=encoding, errors="replace")
     else:
         str_data = str(data)
     simhash = Simhash(get_features(str_data)).value
@@ -82,31 +83,30 @@ def compute_difference(hx, hy):
 def get_data_identifiers(data):
     """Get basic data identifiers (size, hashes, normalized hashes, etc.)."""
 
-    data_identifier = {}
-
-    data_identifier['encoding'] = None
-    data_identifier['description'] = None
-    data_identifier['size'] = None
-    data_identifier['loc'] = None
-    data_identifier['mimetype_magic'] = None
-    data_identifier['md5'] = None
-    data_identifier['sha1'] = None
-    data_identifier['sha256'] = None
-    data_identifier['simhash'] = None
-    data_identifier['size_stripped'] = None
-    data_identifier['normalized_encoding'] = None
-    data_identifier['normalized_description'] = None
-    data_identifier['normalized_size'] = None
-    data_identifier['normalized_loc'] = None
-    data_identifier['normalized_mimetype_magic'] = None
-    data_identifier['normalized_md5'] = None
-    data_identifier['normalized_sha1'] = None
-    data_identifier['normalized_sha256'] = None
-    data_identifier['normalized_simhash'] = None
+    data_identifier = {
+        'encoding': None,
+        'description': None,
+        'size': None,
+        'loc': None,
+        'mimetype_magic': None,
+        'md5': None,
+        'sha1': None,
+        'sha256': None,
+        'simhash': None,
+        'size_stripped': None,
+        'normalized_encoding': None,
+        'normalized_description': None,
+        'normalized_size': None,
+        'normalized_loc': None,
+        'normalized_mimetype_magic': None,
+        'normalized_md5': None,
+        'normalized_sha1': None,
+        'normalized_sha256': None,
+        'normalized_simhash': None
+    }
 
     mimetype_magic = magic.from_buffer(data, mime=True)
 
     magic_desc = ""
     try:
         magic_desc = magic.from_buffer(data)
     except magic.MagicException as exp:
@@ -137,9 +137,10 @@ def get_data_identifiers(data):
         data_identifier['encoding'] = encoding
         try:
             normalized_data, normalized_loc = normalize_jsdata(
-                data.decode(encoding=data_identifier['encoding'],errors="replace"))
+                data.decode(encoding=data_identifier['encoding'], errors="replace"))
         except Exception:
             normalized_data = None
             normalized_loc = 0
 
         if normalized_data is not None:
+            normalized_magic_desc = ""
@@ -149,7 +150,7 @@ def get_data_identifiers(data):
                 rgx = re.compile(r' name use count.*$')
                 msg = str(exp.message)
                 if re.search(rgx, msg):
-                    magic_desc = re.sub(rgx, '', msg)
+                    normalized_magic_desc = re.sub(rgx, '', msg)
                 else:
                     raise exp
             normalized_encoding = chardet.detect(normalized_data)['encoding']
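get_simhash above feeds get_features(str_data) into Simhash from the simhash package and keeps the integer fingerprint. A short sketch of how such fingerprints behave; the n-gram feature extractor below is illustrative only, since the project's own get_features() is not shown in this diff:

from simhash import Simhash


def ngram_features(text, width=3):
    # illustrative shingling; the real get_features() may differ
    text = text.lower()
    return [text[i:i + width] for i in range(max(len(text) - width + 1, 1))]


fp_a = Simhash(ngram_features("var jquery = require('jquery');")).value
fp_b = Simhash(ngram_features("var jQuery = require('jquery') ;")).value
# near-identical inputs yield fingerprints with a small Hamming distance
print(bin(fp_a ^ fp_b).count("1"))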
@@ -18,12 +18,10 @@
    general and Chrome extensions in particular."""
 
 import os
-import io
-from io import StringIO
 import re
 import json
 import zlib
 import logging
 from enum import Enum
 from ExtensionCrawler.js_mincer import mince_js
 from ExtensionCrawler.file_identifiers import get_file_identifiers, is_binary_resource
@@ -107,15 +105,15 @@ def unknown_lib_identifiers():
         re.compile(
             r'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\sv?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
             re.IGNORECASE
-        ), #MatchType: name version, e.g. mylib v1.2.9b or mylib.anything 1.2.8
+        ), # MatchType: name version, e.g. mylib v1.2.9b or mylib.anything 1.2.8
         re.compile(
             r'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\s(?: version)\:?\s?v?([0-9][0-9.a-z_\\\\-]+)',
             re.IGNORECASE
-        ), #MatchType: name version: ver, e.g. mylib version: v1.2.9, or mylib.js version 1.2.8
+        ), # MatchType: name version: ver, e.g. mylib version: v1.2.9, or mylib.js version 1.2.8
         re.compile(
             r'\@*(version)\s?[\:|-]?\s?v?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
             re.IGNORECASE
-        ), #MatchType: version x.x.x, e.g. @version: 1.2.5 or version - 1.2.5 etc.
+        ), # MatchType: version x.x.x, e.g. @version: 1.2.5 or version - 1.2.5 etc.
         re.compile(
             r'(version)[\:|\=]\s?.?([0-9]{1,2}[\.|\-|\_][0-9.a-z_\\\\-]+).?',
             re.IGNORECASE),
@@ -188,13 +186,9 @@ def check_md5_decompressed(con, file_info):
     """Check for known md5 hash (decompressed file content)."""
     if con is None:
         return file_info
-    if file_info['dec_md5'] is None:
-        return file_info
-    else:
+    if file_info['dec_md5'] is not None:
         libver = con.get_cdnjs_info(file_info['dec_md5'])
-        if libver is None:
-            return file_info
-        else:
+        if libver is not None:
             file_info['lib'] = libver[0]
             file_info['version'] = libver[1]
             file_info['lib_filename'] = libver[2]
@@ -203,7 +197,6 @@ def check_md5_decompressed(con, file_info):
         else:
             file_info['type'] = FileClassification.LIBRARY
             file_info['detectionMethod'] = DetectionType.MD5_DECOMPRESSED
-            return file_info
     return file_info
 
 
@@ -361,7 +354,7 @@ def analyse_comment_known_libs(zipfile, js_file, js_info, comment):
     else:
         filename = js_file
     for lib, regex in load_lib_identifiers().items():
-        if ('filecontent' in regex):
+        if 'filecontent' in regex:
             for unkregex in regex['filecontent']:
                 unkown_lib_matched = unkregex.finditer(comment.content)
                 for match in unkown_lib_matched:
@@ -481,13 +474,14 @@ def decompose_js_with_connection(path_or_zipfileobj, con):
                 try:
                     str_data = data.decode(file_info['encoding'])
                 except Exception:
-                    log_info("Exception during data decoding for entry " +
-                             file_info['filename'], 3)
+                    log_info("Exception during data decoding for entry " + file_info['filename'], 3)
                     str_data = ''
+            else:
+                str_data = ''
 
             info_data_blocks = check_data_blocks(file_info, str_data)
         else:
             info_data_blocks = None
 
         if info_data_blocks:
             inventory = inventory + merge_filename_and_data_info(
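The MatchType comments fixed above document what the unknown-library regexes are meant to catch. For instance, the "@version x.x.x" matcher (copied verbatim from the hunk) pulls the version number out of a typical library header comment; the sample comment string below is made up for illustration:

import re

version_re = re.compile(
    r'\@*(version)\s?[\:|-]?\s?v?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
    re.IGNORECASE)

match = version_re.search("/*! mylib - @version: 1.2.5 */")
print(match.group(2))  # prints: 1.2.5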
@@ -198,8 +198,8 @@ def mince_js_fileobj(fileobj):
         except StopIteration:
             pass
 
-        if ((is_comment(state) and is_code_or_string_literal(suc_state)) or
-                (is_code_or_string_literal(state) and is_comment(suc_state))):
+        if ((is_comment(state) and is_code_or_string_literal(suc_state)) or (
+                is_code_or_string_literal(state) and is_comment(suc_state))):
             if content.strip():
                 yield (JsBlock(state, (block_start_line, block_start_cpos),
                                (line, cpos), content, string_literals))
@@ -107,7 +107,7 @@ def main(argv):
         logging.info("Starting update of new db libs")
         pull_and_update_db(cdnjs_git_path, csv)
         logging.info("Finished update of new db libs")
-    if not listfile is None:
+    if listfile is not None:
         logging.info("Starting update from list file")
         update_db_from_listfile(cdnjs_git_path, listfile, csv)
         logging.info("Finished update from list file")

crawler (14 changed lines):
@@ -19,7 +19,6 @@
 A crawler for extensions from the Chrome Web Store.
 """
 
 import os
 import sys
 import datetime
 import time
@@ -141,7 +140,7 @@ def log_summary(res, runtime=0):
     log_info(" Total runtime: {}".format(
         str(datetime.timedelta(seconds=int(runtime)))))
 
-    if corrupt_tar_archives != []:
+    if corrupt_tar_archives:
         log_info("")
         log_info("List of extensions with corrupted files/archives:")
         list(
@@ -229,7 +228,8 @@ def parse_args(argv):
             max_discover = int(arg)
         elif opt == '--pystuck':
             start_pystuck = True
-    return basedir, parallel, verbose, discover, max_discover, download_ext_ids_with_forums, download_ext_ids_without_forums, ext_timeout, start_pystuck
+    return [basedir, parallel, verbose, discover, max_discover, download_ext_ids_with_forums,
+            download_ext_ids_without_forums, ext_timeout, start_pystuck]
 
 
 def main(argv):
@@ -242,8 +242,8 @@ def main(argv):
     multiprocessing.set_start_method("forkserver")
 
     today = datetime.datetime.now(datetime.timezone.utc).isoformat()
-    basedir, parallel, verbose, discover, max_discover, download_ext_ids_with_forums, download_ext_ids_without_forums, ext_timeout, start_pystuck = parse_args(
-        argv)
+    [basedir, parallel, verbose, discover, max_discover, download_ext_ids_with_forums, download_ext_ids_without_forums,
+     ext_timeout, start_pystuck] = parse_args(argv)
 
     setup_logger(verbose)
 
@@ -304,7 +304,7 @@ def main(argv):
     # We re-try (once) the extensions with unknown exceptions, as
     # they are often temporary
     has_exception = list(filter(lambda x: x.has_exception(), res))
-    if has_exception != []:
+    if has_exception:
         log_info(
             " {} extensions with unknown exceptions, start another try ...".
             format(str(len(has_exception))))
@@ -318,7 +318,7 @@ def main(argv):
         res = list(set(res) - set(has_exception)) + res_update
 
     end_time = time.time()
-    log_summary(res, end_time - start_time)
+    log_summary(res, int(end_time - start_time))
     log_failures_to_file(log_dir, today, res)
 

create-db (15 changed lines):
@@ -17,7 +17,6 @@
 #
 
 import getopt
 import os
 import sys
 import tarfile
 import time
@@ -30,12 +29,12 @@ import datetime
 
 from ExtensionCrawler.archive import update_db_incremental
 from ExtensionCrawler.config import *
-from ExtensionCrawler.util import log_info, log_warning, log_error, log_exception
+from ExtensionCrawler.util import log_info, log_exception
 
 from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend
 
 
-def help():
+def print_help():
     print("""create-db [OPTION]""")
     print(""" -h print this help text""")
     print(""" -a <DIR> archive directory""")
@@ -122,11 +121,11 @@ def parse_args(argv):
             "maxtaskid=", "from-date=", "until-date=", "help"
         ])
     except getopt.GetoptError:
-        help()
+        print_help()
         sys.exit(2)
     for opt, arg in opts:
         if opt in ("-h", "--help"):
-            help()
+            print_help()
             sys.exit()
         elif opt in ("-a", "--archive"):
             archive = arg
@@ -140,12 +139,12 @@ def parse_args(argv):
             taskid = int(arg)
         elif opt in ("-N", "--maxtaskid"):
             maxtaskid = int(arg)
-        elif opt in ("--from-date"):
+        elif opt == "--from-date":
             from_date = arg
-        elif opt in ("--until-date"):
+        elif opt == "--until-date":
             until_date = arg
 
-    if paths == []:
+    if not paths:
         paths = list(find(archive, "*"))
 
     chunksize = int(len(paths) / maxtaskid)
@@ -58,7 +58,6 @@ def main(argv):
     basedir = const_basedir()
     verbose = True
     date = None
     extid = ""
     useetag = False
     output = ""
     winfs = False

@@ -44,7 +44,6 @@ def main(argv):
     """Main function of the extension crawler."""
     verbose = False
     silent = False
     filename = None
     csvfile = None
     database = True
     try:
@@ -39,7 +39,7 @@ import jsbeautifier
 
 from ExtensionCrawler.config import (const_log_format, const_basedir)
 from ExtensionCrawler.archive import last_crx, first_crx, all_crx
-from ExtensionCrawler.config import (archive_file, get_local_archive_dir)
+from ExtensionCrawler.config import get_local_archive_dir
 from ExtensionCrawler.js_decomposer import init_file_info
 from ExtensionCrawler.js_mincer import mince_js
 
@@ -54,7 +54,7 @@ def is_file_with_c_style_comments(filename):
 
 def jsstrings_data(conf, path, data):
     """Analyze data in memory."""
-    if not conf.file_pattern is None:
+    if conf.file_pattern is not None:
         if path is None:
             return False
         elif not fnmatch.fnmatch(path, conf.file_pattern):
@@ -98,7 +98,7 @@ def jsstrings_data(conf, path, data):
             if analyze_block(conf, path, block, first):
                 match = True
             first = False
-    if match and conf.output_decoration > 0 and conf.output_decoration < 2:
+    if match and 0 < conf.output_decoration < 2:
         print(path)
     return match
 
@@ -112,6 +112,7 @@ def print_block(conf,
     if conf.output_decoration > 1:
         line_no = block.start[0]
         prefix = " " * (block.start[1] - 1)
+        # TODO: use classifier
         classifier = "X"
         sep = "=" * (len(path) + 17)
         if not first:
@@ -129,10 +130,10 @@ def print_block(conf,
                     path, loc, block.string_literals[0].rstrip())
                 print(line)
             else:
-                for (pos, str) in block.string_literals:
+                for (pos, string) in block.string_literals:
                     loc = '({0[0]:d}/{0[1]:d})'.format(pos)
                     loc = (' ' * (11 - len(loc))) + loc
-                    line = '{0} {1} [L]: {2}'.format(path, loc, str.rstrip())
+                    line = '{0} {1} [L]: {2}'.format(path, loc, string.rstrip())
                     print(line)
         if code_match:
             print("-" * (len(path) + 17))
@@ -151,7 +152,7 @@ def analyze_block(conf, path, block, first=False):
     """Print code/comment blocks."""
     match = False
     regexps = []
-    if not conf.reg_exp is None:
+    if conf.reg_exp is not None:
         for regexp in conf.reg_exp:
             if conf.case_insensitive:
                 regexps.append(re.compile(r'(' + regexp + ')', re.IGNORECASE))
@@ -159,7 +160,7 @@ def analyze_block(conf, path, block, first=False):
                 regexps.append(re.compile(r'(' + regexp + ')'))
     if block.is_comment():
         content = block.content
-        if not conf.reg_exp_comments is None:
+        if conf.reg_exp_comments is not None:
             for regexp in conf.reg_exp_comments:
                 if conf.case_insensitive:
                     regexps.append(
@@ -179,14 +180,14 @@ def analyze_block(conf, path, block, first=False):
         content = block.content
         regexps_string = regexps.copy()
         regexps_code = regexps.copy()
-        if not conf.reg_exp_string_literals is None:
+        if conf.reg_exp_string_literals is not None:
             for regexp in conf.reg_exp_string_literals:
                 if conf.case_insensitive:
                     regexps.append(
                         re.compile(r'(' + regexp + ')', re.IGNORECASE))
                 else:
                     regexps.append(re.compile(r'(' + regexp + ')'))
-        if not conf.reg_exp_source is None:
+        if conf.reg_exp_source is not None:
             for regexp in conf.reg_exp_source:
                 if conf.case_insensitive:
                     regexps.append(
@@ -222,9 +223,9 @@ def analyze_block(conf, path, block, first=False):
                     match_idxs.add(idx)
                     string_match = True
         block.string_literals = []
-        for idx, str in enumerate(string_literals):
+        for idx, string in enumerate(string_literals):
             if idx in match_idxs:
-                block.string_literals.append(str)
+                block.string_literals.append(string)
 
         code_match = False
         for regexp in regexps_code:
@@ -259,9 +260,6 @@ def analyze_crx(conf, crx, path=""):
 
 def analyze_tar(conf, tarfilename):
-    last_crx_file = ''
-    # from_date
-    # latest_date
     match = False
     extid = os.path.splitext(os.path.basename(tarfilename))[0]
     from_dateobj = None
     latest_dateobj = None
@@ -303,7 +301,7 @@ def analyze_tar(conf, tarfilename):
             # both dates are given
             all_crx_files = all_crx(
                 os.path.join(conf.archive_dir, "data"), extid)
-            if all_crx_files == []:
+            if not all_crx_files:
                 logging.warning("No crx in " + extid)
             else:
                 with tarfile.open(tarfilename, 'r') as archive:
@@ -354,14 +352,13 @@ def analyze_task(conf, task):
     """Analyze one file/tar/crx/extid."""
     logging.debug("Analyzing " + task)
     extid_re = re.compile('^[a-p]+$')
-    retval = False
     if task.endswith('.crx'):
         retval = analyze_crx(conf, task)
     elif task.endswith('.tar'):
         retval = analyze_tar(conf, task)
     elif extid_re.match(task):
-        tarfile = "data/" + get_local_archive_dir(task) + "/" + task + '.tar'
-        retval = analyze_tar(conf, conf.archive_dir + "/" + tarfile)
+        tarfilename = "data/" + get_local_archive_dir(task) + "/" + task + '.tar'
+        retval = analyze_tar(conf, conf.archive_dir + "/" + tarfilename)
     else:
         retval = analyze_file(conf, task)
     return retval

extfind (15 changed lines):
@@ -21,10 +21,12 @@ import glob
 import os
 import sys
 import logging
 import re
 
 from ExtensionCrawler import config
-def help():
+
+
+def print_help():
     print("""extfind [OPTION]""")
     print(""" -h print this help text""")
     print(""" -a <DIR> archive directory""")
@@ -51,7 +53,7 @@ def iter_extension_paths_from_file(archive, n, N, extidlistfile):
         if re.fullmatch("[a-p]{32}", line) and os.path.exists(path):
             paths += [path]
         else:
-            logging.warn("WARNING: {} is not a valid extension path!".format(path))
+            logging.warning("WARNING: {} is not a valid extension path!".format(path))
     return split(paths, n, N)
 
 
@@ -67,19 +69,17 @@ def main(argv):
     taskid = 1
     maxtaskid = 1
 
-    paths = []
-
     try:
         opts, args = getopt.getopt(argv, "ha:g:e:n:N:", [
             "archive=", "glob=", "extidlistfile=", "taskid=",
             "maxtaskid=", "help"
         ])
     except getopt.GetoptError:
-        help()
+        print_help()
         sys.exit(2)
     for opt, arg in opts:
         if opt in ("-h", "--help"):
-            help()
+            print_help()
             sys.exit()
         elif opt in ("-a", "--archive"):
             archive = arg
@@ -99,11 +99,12 @@ def main(argv):
     elif extidglob is not None and extidlistfile is None:
         paths = iter_extension_paths(archive, taskid, maxtaskid, extidglob)
     else:
-        help()
+        print_help()
         sys.exit(2)
 
     for path in paths:
         print(path)
 
+
 if __name__ == "__main__":
     main(sys.argv[1:])
@@ -1,8 +1,10 @@
 colorama==0.3.9
 pystuck==0.8.5
 simhash==1.8.0
 tabulate==0.7.7
 setuptools==36.2.7
 cchardet==2.1.1
-mysqlclient==1.3.10
+mysqlclient==1.3.12
 requests==2.18.1
 pycrypto==2.6.1
 beautifulsoup4==4.6.0

setup.py (5 changed lines):
@@ -1,9 +1,12 @@
 from setuptools import setup
 
+with open('requirements.txt') as f:
+    requirements = f.read().splitlines()
+
 setup(
     name='Extension Crawler',
     description='A collection of utilities for downloading and analyzing browser extension from the Chrome Web store.',
     author='Achim D. Brucker, Michael Herzberg',
     license='GPL 3.0',
-    install_requires=['GitPython', 'pebble', 'simhash', 'colorama', 'python_magic', 'tabulate', 'requests', 'pycrypto', 'beautifulsoup4', 'python_dateutil', 'mysqlclient', 'cchardet', 'jsbeautifier', 'pystuck']
+    install_requires=requirements
 )