diff --git a/ExtensionCrawler/archive.py b/ExtensionCrawler/archive.py
index ddea8be..1f4187d 100644
--- a/ExtensionCrawler/archive.py
+++ b/ExtensionCrawler/archive.py
@@ -23,7 +23,6 @@ import os
import glob
import re
import json
-from multiprocessing import Pool
from concurrent.futures import TimeoutError
from pebble import ProcessPool, ProcessExpired
from functools import partial
@@ -44,6 +43,7 @@ from ExtensionCrawler.config import (
from ExtensionCrawler.util import google_dos_protection, value_of, log_info, log_warning, log_exception, setup_logger
from ExtensionCrawler.db import update_db_incremental
+
class Error(Exception):
pass
@@ -82,11 +82,11 @@ class RequestResult:
class UpdateResult:
- def __init__(self, id, is_new, exception, res_overview, res_crx,
+ def __init__(self, ext_id, is_new, exception, res_overview, res_crx,
res_reviews, res_support, res_sql, sql_update, worker_exception=None):
- self.id = id
+ self.ext_id = ext_id
self.new = is_new
- self.exception = exception # TODO: should be tar_exception
+ self.exception = exception # TODO: should be tar_exception
self.res_overview = res_overview
self.res_crx = res_crx
self.res_reviews = res_reviews
@@ -188,7 +188,7 @@ def last_modified_http_date(path):
def last_crx(archivedir, extid, date=None):
- last_crx = ""
+ last_crx_path = ""
last_crx_etag = ""
etag_file = os.path.join(archivedir, get_local_archive_dir(extid),
@@ -198,14 +198,13 @@ def last_crx(archivedir, extid, date=None):
with open(etag_file, 'r') as f:
d = json.load(f)
return d["last_crx"], d["last_crx_etag"]
- except Exception as e:
+ except Exception:
log_exception("Something was wrong with the etag file {}, deleting it ...".format(etag_file))
try:
os.remove(etag_file)
- except Exception as e:
+ except Exception:
log_exception("Could not remove etag file {}!".format(etag_file))
-
# If we do not yet have an .etag file present, open the tarfile and look
# there for one. After having done that once, the crawler creates the .etag
# file to avoid opening the tar file in the future.
@@ -219,23 +218,23 @@ def last_crx(archivedir, extid, date=None):
date is None or (dateutil.parser.parse(
os.path.split(os.path.split(x.name)[0])[1]) <= date))
])
- if old_crxs != []:
- last_crx = old_crxs[-1]
+ if old_crxs:
+ last_crx_path = old_crxs[-1]
headers_content = t.extractfile(
- last_crx + ".headers").read().decode().replace(
+ last_crx_path + ".headers").read().decode().replace(
'"', '\\"').replace("'", '"')
headers_json = json.loads(headers_content)
last_crx_etag = headers_json["ETag"]
if date is None:
with open(etag_file, 'w') as f:
- json.dump({"last_crx": last_crx, "last_crx_etag": last_crx_etag}, f)
+ json.dump({"last_crx": last_crx_path, "last_crx_etag": last_crx_etag}, f)
- return last_crx, last_crx_etag
+ return last_crx_path, last_crx_etag
def first_crx(archivedir, extid, date=None):
- first_crx = ""
+ first_crx_path = ""
tar = os.path.join(archivedir, get_local_archive_dir(extid),
extid + ".tar")
if os.path.exists(tar):
@@ -247,10 +246,10 @@ def first_crx(archivedir, extid, date=None):
os.path.split(os.path.split(x.name)[0])[1])))
])
t.close()
- if old_crxs != []:
- first_crx = old_crxs[0]
+ if old_crxs:
+ first_crx_path = old_crxs[0]
- return first_crx
+ return first_crx_path
def all_crx(archivedir, extid, date=None):
@@ -283,7 +282,7 @@ def update_overview(tar, date, ext_id):
def validate_crx_response(res, extid, extfilename):
regex_extfilename = re.compile(r'^extension[_0-9]+\.crx$')
- if not 'Content-Type' in res.headers:
+ if 'Content-Type' not in res.headers:
raise CrawlError(extid, 'Did not find Content-Type header.', '\n'.join(
res.iter_lines()))
if not res.headers['Content-Type'] == 'application/x-chrome-extension':
@@ -351,10 +350,12 @@ def update_crx(archivedir, tmptardir, ext_id, date):
f.write(chunk)
write_text(tmptardir, date, extfilename + ".etag",
res.headers.get("ETag"))
- etag_file = os.path.join(archivedir, get_local_archive_dir(ext_id),
- ext_id + ".etag")
+ etag_file = os.path.join(archivedir, get_local_archive_dir(ext_id), ext_id + ".etag")
with open(etag_file, 'w') as f:
- json.dump({"last_crx": os.path.join(ext_id, date, extfilename), "last_crx_etag": res.headers.get("ETag")}, f)
+ json.dump({
+ "last_crx": os.path.join(ext_id, date, extfilename),
+ "last_crx_etag": res.headers.get("ETag")
+ }, f)
except Exception as e:
log_exception("Exception when updating crx", 3, ext_id)
write_text(tmptardir, date, extfilename + ".exception",
@@ -367,9 +368,10 @@ def iterate_authors(pages):
for page in pages:
json_page = json.loads(page[page.index("{\""):page.rindex("}}},") + 1])
for annotation in json_page["annotations"]:
- if "attributes" in annotation and "replyExists" in annotation["attributes"] and annotation["attributes"]["replyExists"]:
- yield (annotation["entity"]["author"],
- annotation["entity"]["groups"])
+ if "attributes" in annotation:
+ if "replyExists" in annotation["attributes"]:
+ if annotation["attributes"]["replyExists"]:
+ yield (annotation["entity"]["author"], annotation["entity"]["groups"])
def update_reviews(tar, date, ext_id):
@@ -550,7 +552,7 @@ def update_extension(archivedir, forums, ext_id):
try:
write_text(tardir, date, ext_id + ".sql.exception",
traceback.format_exc())
- except Exception as e:
+ except Exception:
pass
try:
shutil.rmtree(path=tmpdir)
@@ -581,13 +583,11 @@ def init_process(verbose, start_pystuck=False):
pystuck.run_server(port=((os.getpid() % 10000) + 10001))
-def execute_parallel(archivedir, max_retry, timeout, max_workers, ext_ids, forums, verbose, start_pystuck):
- results=[]
- with ProcessPool(max_workers=max_workers, max_tasks=100, initializer=init_process, initargs=(verbose, start_pystuck)) as pool:
- future = pool.map(partial(update_extension, archivedir, forums),
- ext_ids,
- chunksize=1,
- timeout=timeout)
+def execute_parallel(archivedir, timeout, max_workers, ext_ids, forums, verbose, start_pystuck):
+ results = []
+ with ProcessPool(max_workers=max_workers, max_tasks=100, initializer=init_process,
+ initargs=(verbose, start_pystuck)) as pool:
+ future = pool.map(partial(update_extension, archivedir, forums), ext_ids, chunksize=1, timeout=timeout)
iterator = future.result()
for ext_id in ext_ids:
try:
@@ -595,7 +595,7 @@ def execute_parallel(archivedir, max_retry, timeout, max_workers, ext_ids, forum
except StopIteration:
break
except TimeoutError as error:
- log_warning("WorkerException: Processing of %s took longer than %d seconds" % (ext_id,error.args[1]))
+ log_warning("WorkerException: Processing of %s took longer than %d seconds" % (ext_id, error.args[1]))
results.append(UpdateResult(ext_id, False, None, None, None, None, None, None, None, error))
except ProcessExpired as error:
log_warning("WorkerException: %s (%s), exit code: %d" % (error, ext_id, error.exitcode))
@@ -609,8 +609,6 @@ def execute_parallel(archivedir, max_retry, timeout, max_workers, ext_ids, forum
def update_extensions(archivedir, parallel, forums_ext_ids, ext_ids, timeout, verbose, start_pystuck):
- ext_with_forums = []
- ext_without_forums = []
forums_ext_ids = (list(set(forums_ext_ids)))
log_info("Updating {} extensions ({} including forums)".format(
@@ -621,13 +619,13 @@ def update_extensions(archivedir, parallel, forums_ext_ids, ext_ids, timeout, ve
parallel_ids = ext_ids
log_info("Updating {} extensions excluding forums (parallel)".format(
len(parallel_ids)), 1)
- ext_without_forums = execute_parallel(archivedir, 3, timeout, parallel, parallel_ids, False, verbose, start_pystuck)
+ ext_without_forums = execute_parallel(archivedir, timeout, parallel, parallel_ids, False, verbose, start_pystuck)
# Second, update extensions with forums sequentially (and with delays) to
    # avoid running into Google's DDoS detection.
log_info("Updating {} extensions including forums (sequentially)".format(
len(forums_ext_ids)), 1)
- ext_with_forums = execute_parallel(archivedir, 3, timeout, 1, forums_ext_ids, True, verbose, start_pystuck)
+ ext_with_forums = execute_parallel(archivedir, timeout, 1, forums_ext_ids, True, verbose, start_pystuck)
return ext_with_forums + ext_without_forums
diff --git a/ExtensionCrawler/cdnjs_crawler.py b/ExtensionCrawler/cdnjs_crawler.py
index ce20a8c..b5f7f5b 100644
--- a/ExtensionCrawler/cdnjs_crawler.py
+++ b/ExtensionCrawler/cdnjs_crawler.py
@@ -169,10 +169,10 @@ def update_lib(force, archive, lib):
outphased = []
for lib_ver in local_lib_json['assets']:
version = lib_ver['version']
- if not version in cdnjs_versions:
+ if version not in cdnjs_versions:
logging.warning("Found outphased versions for " + name + " " +
str(version) + " , preserving from archive.")
- if not 'outphased' in lib_ver:
+ if 'outphased' not in lib_ver:
lib_ver[
'outphased'] = datetime.datetime.utcnow().isoformat()
outphased.append(lib_ver)
@@ -260,7 +260,7 @@ def delete_orphaned(archive, local_libs, cdnjs_current_libs):
"""Delete all orphaned local libaries."""
dirname = os.path.join(archive, "filedb", "cdnjs", "lib")
for lib in local_libs:
- if not lib in cdnjs_current_libs:
+ if lib not in cdnjs_current_libs:
os.remove(os.path.join(dirname, lib + ".json"))
diff --git a/ExtensionCrawler/cdnjs_git.py b/ExtensionCrawler/cdnjs_git.py
index bd66ad4..138555c 100644
--- a/ExtensionCrawler/cdnjs_git.py
+++ b/ExtensionCrawler/cdnjs_git.py
@@ -25,8 +25,7 @@ import logging
import os
import re
import sys
-from functools import partial, reduce
-from multiprocessing import Pool
+from functools import reduce
import dateutil.parser
import git
@@ -70,8 +69,8 @@ def pull_list_changed_files(git_path):
for diff in single_fetch_info.commit.diff(
single_fetch_info.old_commit):
logging.debug("Found diff: " + str(diff))
- if not diff.a_blob is None:
- if not diff.a_blob.path in files:
+ if diff.a_blob is not None:
+ if diff.a_blob.path not in files:
files.append(diff.a_blob.path)
return files
@@ -98,7 +97,7 @@ def hackish_pull_list_changed_files(git_path):
for line in pull_lines:
match = re.search(r'^ (.+) \| .*$', line)
- if not match is None:
+ if match is not None:
changed_files = match.group(1).split('=>')
for changed_file in changed_files:
files.add(changed_file.strip())
@@ -139,6 +138,7 @@ def get_file_libinfo(release_dic, git_path, libfile):
file_info['library'] = lib
file_info['version'] = version
file_info['add_date'] = release_dic[(lib, version)]
+ # TODO: why is package not used?
package = os.path.join(
reduce(os.path.join, plist[:idx + 1]), "package.json")
return file_info
@@ -167,7 +167,7 @@ def get_all_lib_files(cdnjs_git_path, localpath=None):
libvers = set()
files = []
versionidx = len(path_to_list(cdnjs_git_path)) + 4
- if not localpath is None:
+ if localpath is not None:
paths = os.path.join(cdnjs_git_path, localpath)
else:
paths = os.path.join(cdnjs_git_path, 'ajax/libs/**/*')
@@ -196,7 +196,7 @@ def update_database_for_file(create_csv, release_dic, cdnjs_git_path, filename,
if os.path.isfile(filename):
logging.info("Updating database for file " + filename)
file_info = get_file_libinfo(release_dic, cdnjs_git_path, filename)
- if not file_info is None:
+ if file_info is not None:
if create_csv:
print(file_info['path'])
print(cdnjs_git_path)
@@ -268,7 +268,7 @@ def update_database_for_file_chunked(create_csv, release_dic, cdnjs_git_path,
retries = 0
success = False
max_retries = 4
- while (not success and (retries < max_retries)):
+ while not success and (retries < max_retries):
try:
update_database_for_file_chunked_timeout(create_csv, release_dic,
cdnjs_git_path, filenames)
@@ -305,7 +305,7 @@ def get_release_triple(git_path, libver):
lib = plist[-2]
date = get_add_date(git_path, libver)
logging.info("Release information:" + lib + " " + ver + ": " + str(date))
- return (lib, ver, date)
+ return lib, ver, date
def build_release_date_dic(git_path, libvers):
@@ -332,7 +332,6 @@ def pull_and_update_db(cdnjs_git_path, create_csv):
def update_db_from_listfile(cdnjs_git_path, listfile, create_csv):
"""Update database (without pull) for files in listfile)"""
- paths = []
with open(listfile) as listfileobj:
paths = listfileobj.read().splitlines()
files = []
diff --git a/ExtensionCrawler/config.py b/ExtensionCrawler/config.py
index 8011092..c6b7754 100644
--- a/ExtensionCrawler/config.py
+++ b/ExtensionCrawler/config.py
@@ -142,10 +142,12 @@ def const_verbose():
"""Default verbosity."""
return True
+
def const_use_process_pool():
"""Use ProcessPool (from module 'pebble') for concurrency."""
return False
+
def const_log_format():
return '%(process)6s %(asctime)s %(levelname)8s %(message)s'
@@ -154,14 +156,17 @@ def const_discover():
"""Default configuration of discovery mode"""
return False
+
def const_download_ext_ids_with_forums():
"""Download extensions with forums (sequential mode)"""
return True
+
def const_download_ext_ids_without_forums():
"""Download extensions without forums (parallel mode)"""
return True
+
def const_ext_timeout():
"""Timeout for downloading an individual extension (2 hours)."""
return 2*60*60
diff --git a/ExtensionCrawler/db.py b/ExtensionCrawler/db.py
index a54b6e0..9c1e278 100644
--- a/ExtensionCrawler/db.py
+++ b/ExtensionCrawler/db.py
@@ -15,13 +15,12 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
-from ExtensionCrawler.config import *
-from ExtensionCrawler.util import *
-from ExtensionCrawler.crx import *
-from ExtensionCrawler.archive import *
-from ExtensionCrawler.js_decomposer import decompose_js_with_connection, DetectionType, FileClassification
+from ExtensionCrawler.config import const_mysql_config_file
+from ExtensionCrawler.crx import read_crx
+from ExtensionCrawler.js_decomposer import decompose_js_with_connection
+from ExtensionCrawler.util import log_warning, log_debug, log_exception, log_info
-from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend
+from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend, convert_date
import re
from bs4 import BeautifulSoup
@@ -63,7 +62,7 @@ def get_etag(ext_id, datepath, con):
link = f.read()
linked_date = link[3:].split("/")[0]
- result = con.get_etag(ext_id, con.convert_date(linked_date))
+ result = con.get_etag(ext_id, convert_date(linked_date))
if result is not None:
return result
@@ -166,7 +165,7 @@ def parse_and_insert_overview(ext_id, date, datepath, con):
con.insert(
"extension",
extid=ext_id,
- date=con.convert_date(date),
+ date=convert_date(date),
name=name,
version=version,
description=description,
@@ -184,12 +183,12 @@ def parse_and_insert_overview(ext_id, date, datepath, con):
con.insert(
"category",
extid=ext_id,
- date=con.convert_date(date),
+ date=convert_date(date),
category_md5=hashlib.md5(category.encode()).digest(),
category=category)
-def parse_and_insert_crx(ext_id, date, datepath, con):
+def parse_and_insert_crx(ext_id, datepath, con):
crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
if not crx_path:
return
@@ -314,7 +313,7 @@ def parse_and_insert_review(ext_id, date, reviewpath, con):
con.insert(
"review",
extid=ext_id,
- date=con.convert_date(date),
+ date=convert_date(date),
commentdate=datetime.datetime.utcfromtimestamp(
get(review, "timestamp")).isoformat()
if "timestamp" in review else None,
@@ -345,7 +344,7 @@ def parse_and_insert_support(ext_id, date, supportpath, con):
con.insert(
"support",
extid=ext_id,
- date=con.convert_date(date),
+ date=convert_date(date),
commentdate=datetime.datetime.utcfromtimestamp(
get(review, "timestamp")).isoformat()
if "timestamp" in review else None,
@@ -365,7 +364,7 @@ def parse_and_insert_replies(ext_id, date, repliespath, con):
log_debug("- parsing reply file", 3, ext_id)
with open(repliespath) as f:
d = json.load(f)
- if not "searchResults" in d:
+ if "searchResults" not in d:
log_warning("* WARNING: there are no search results in {}".format(
repliespath), 3, ext_id)
return
@@ -379,7 +378,7 @@ def parse_and_insert_replies(ext_id, date, repliespath, con):
con.insert(
"reply",
extid=ext_id,
- date=con.convert_date(date),
+ date=convert_date(date),
commentdate=datetime.datetime.utcfromtimestamp(
get(annotation, "timestamp")).isoformat()
if "timestamp" in annotation else None,
@@ -413,7 +412,7 @@ def parse_and_insert_status(ext_id, date, datepath, con):
con.insert(
"status",
extid=ext_id,
- date=con.convert_date(date),
+ date=convert_date(date),
crx_status=crx_status,
overview_status=overview_status,
overview_exception=overview_exception)
@@ -439,8 +438,8 @@ def update_db_incremental_with_connection(tmptardir, ext_id, date, con):
if etag:
try:
- parse_and_insert_crx(ext_id, date, datepath, con)
- except Exception as e:
+ parse_and_insert_crx(ext_id, datepath, con)
+ except Exception:
log_exception("Exception when parsing crx", 3, ext_id)
else:
crx_status = get_crx_status(datepath)
@@ -449,40 +448,40 @@ def update_db_incremental_with_connection(tmptardir, ext_id, date, con):
try:
parse_and_insert_overview(ext_id, date, datepath, con)
- except Exception as e:
+ except Exception:
log_exception("Exception when parsing overview", 3, ext_id)
try:
parse_and_insert_status(ext_id, date, datepath, con)
- except Exception as e:
+ except Exception:
log_exception("Exception when parsing status", 3, ext_id)
reviewpaths = glob.glob(os.path.join(datepath, "reviews*-*.text"))
for reviewpath in reviewpaths:
try:
parse_and_insert_review(ext_id, date, reviewpath, con)
- except json.decoder.JSONDecodeError as e:
+ except json.decoder.JSONDecodeError:
log_warning("- WARNING: Review is not a proper json file!", 3,
ext_id)
- except Exception as e:
+ except Exception:
log_exception("Exception when parsing review", 3, ext_id)
supportpaths = glob.glob(os.path.join(datepath, "support*-*.text"))
for supportpath in supportpaths:
try:
parse_and_insert_support(ext_id, date, supportpath, con)
- except json.decoder.JSONDecodeError as e:
+ except json.decoder.JSONDecodeError:
log_warning("- WARNING: Support is not a proper json file!", 3,
ext_id)
- except Exception as e:
+ except Exception:
log_exception("Exception when parsing support", 3, ext_id)
repliespaths = glob.glob(os.path.join(datepath, "*replies.text"))
for repliespath in repliespaths:
try:
parse_and_insert_replies(ext_id, date, repliespath, con)
- except json.decoder.JSONDecodeError as e:
+ except json.decoder.JSONDecodeError:
log_warning("- WARNING: Reply is not a proper json file!", 3,
ext_id)
- except Exception as e:
+ except Exception:
log_exception("Exception when parsing reply", 3, ext_id)
diff --git a/ExtensionCrawler/dbbackend/mysql_backend.py b/ExtensionCrawler/dbbackend/mysql_backend.py
index 5a55c4f..fb0b39a 100644
--- a/ExtensionCrawler/dbbackend/mysql_backend.py
+++ b/ExtensionCrawler/dbbackend/mysql_backend.py
@@ -18,18 +18,17 @@
import time
import datetime
from random import uniform
-from itertools import starmap
-import logging
import MySQLdb
import _mysql_exceptions
import ExtensionCrawler.config as config
-from ExtensionCrawler.util import log_info, log_error, log_exception, log_warning
+from ExtensionCrawler.util import log_info, log_error, log_warning
class MysqlBackend:
- def __init__(self, ext_id, try_wait=config.const_mysql_try_wait(), maxtries=config.const_mysql_maxtries(), **kwargs):
+ def __init__(self, ext_id, try_wait=config.const_mysql_try_wait(), maxtries=config.const_mysql_maxtries(),
+ **kwargs):
self.ext_id = ext_id
self.dbargs = kwargs
self.try_wait = try_wait
@@ -147,5 +146,6 @@ class MysqlBackend:
result = self.retry(lambda: self.cursor.fetchone())
return result
- def convert_date(self, date):
- return date[:-6]
+
+def convert_date(date):
+ return date[:-6]
diff --git a/ExtensionCrawler/discover.py b/ExtensionCrawler/discover.py
index af51ba6..3ef9e86 100644
--- a/ExtensionCrawler/discover.py
+++ b/ExtensionCrawler/discover.py
@@ -17,12 +17,10 @@
"""Python mnodule providing methods for discovering extensions in the
Chrome extension store."""
-import xml.etree.ElementTree as ET
+from xml.etree.ElementTree import fromstring
import re
-from functools import reduce
import requests
from ExtensionCrawler import config
-from ExtensionCrawler.util import log_info, log_exception
def crawl_nearly_all_of_ext_ids():
@@ -30,7 +28,7 @@ def crawl_nearly_all_of_ext_ids():
def get_inner_elems(doc):
"""Get inner element."""
- return ET.fromstring(doc).iterfind(r".//{{{}}}loc".format(
+ return fromstring(doc).iterfind(r".//{{{}}}loc".format(
config.const_sitemap_scheme()))
def is_generic_url(url):
diff --git a/ExtensionCrawler/file_identifiers.py b/ExtensionCrawler/file_identifiers.py
index c12a5b8..31e0262 100644
--- a/ExtensionCrawler/file_identifiers.py
+++ b/ExtensionCrawler/file_identifiers.py
@@ -30,12 +30,14 @@ import magic
from ExtensionCrawler.js_mincer import mince_js
+
def is_binary_resource(mimetype_magic):
return (mimetype_magic.startswith("image/") or
mimetype_magic.startswith("video/") or
mimetype_magic.startswith("audio/") or
mimetype_magic == "application/pdf")
+
def normalize_jsdata(str_data):
"""Compute normalized code blocks of a JavaScript file"""
txt = ""
@@ -59,9 +61,8 @@ def get_features(s):
def get_simhash(encoding, data):
"""Compute simhash of text."""
- str_data = ""
- if not encoding is None:
- str_data = data.decode(encoding=encoding,errors="replace")
+ if encoding is not None:
+ str_data = data.decode(encoding=encoding, errors="replace")
else:
str_data = str(data)
simhash = Simhash(get_features(str_data)).value
@@ -82,31 +83,30 @@ def compute_difference(hx, hy):
def get_data_identifiers(data):
"""Get basic data identifiers (size, hashes, normalized hashes, etc.)."""
- data_identifier = {}
-
- data_identifier['encoding'] = None
- data_identifier['description'] = None
- data_identifier['size'] = None
- data_identifier['loc'] = None
- data_identifier['mimetype_magic'] = None
- data_identifier['md5'] = None
- data_identifier['sha1'] = None
- data_identifier['sha256'] = None
- data_identifier['simhash'] = None
- data_identifier['size_stripped'] = None
- data_identifier['normalized_encoding'] = None
- data_identifier['normalized_description'] = None
- data_identifier['normalized_size'] = None
- data_identifier['normalized_loc'] = None
- data_identifier['normalized_mimetype_magic'] = None
- data_identifier['normalized_md5'] = None
- data_identifier['normalized_sha1'] = None
- data_identifier['normalized_sha256'] = None
- data_identifier['normalized_simhash'] = None
+ data_identifier = {
+ 'encoding': None,
+ 'description': None,
+ 'size': None,
+ 'loc': None,
+ 'mimetype_magic': None,
+ 'md5': None,
+ 'sha1': None,
+ 'sha256': None,
+ 'simhash': None,
+ 'size_stripped': None,
+ 'normalized_encoding': None,
+ 'normalized_description': None,
+ 'normalized_size': None,
+ 'normalized_loc': None,
+ 'normalized_mimetype_magic': None,
+ 'normalized_md5': None,
+ 'normalized_sha1': None,
+ 'normalized_sha256': None,
+ 'normalized_simhash': None
+ }
mimetype_magic = magic.from_buffer(data, mime=True)
- magic_desc = ""
try:
magic_desc = magic.from_buffer(data)
except magic.MagicException as exp:
@@ -137,9 +137,10 @@ def get_data_identifiers(data):
data_identifier['encoding'] = encoding
try:
normalized_data, normalized_loc = normalize_jsdata(
- data.decode(encoding=data_identifier['encoding'],errors="replace"))
+ data.decode(encoding=data_identifier['encoding'], errors="replace"))
except Exception:
normalized_data = None
+ normalized_loc = 0
if normalized_data is not None:
normalized_magic_desc = ""
@@ -149,7 +150,7 @@ def get_data_identifiers(data):
rgx = re.compile(r' name use count.*$')
msg = str(exp.message)
if re.search(rgx, msg):
- magic_desc = re.sub(rgx, '', msg)
+ normalized_magic_desc = re.sub(rgx, '', msg)
else:
raise exp
normalized_encoding = chardet.detect(normalized_data)['encoding']
diff --git a/ExtensionCrawler/js_decomposer.py b/ExtensionCrawler/js_decomposer.py
index 7292874..62e5b2e 100644
--- a/ExtensionCrawler/js_decomposer.py
+++ b/ExtensionCrawler/js_decomposer.py
@@ -18,12 +18,10 @@
general and Chrome extensions in particular."""
import os
-import io
from io import StringIO
import re
import json
import zlib
-import logging
from enum import Enum
from ExtensionCrawler.js_mincer import mince_js
from ExtensionCrawler.file_identifiers import get_file_identifiers, is_binary_resource
@@ -107,15 +105,15 @@ def unknown_lib_identifiers():
re.compile(
r'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\sv?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
re.IGNORECASE
- ), #MatchType: name version, e.g. mylib v1.2.9b or mylib.anything 1.2.8
+ ), # MatchType: name version, e.g. mylib v1.2.9b or mylib.anything 1.2.8
re.compile(
r'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\s(?: version)\:?\s?v?([0-9][0-9.a-z_\\\\-]+)',
re.IGNORECASE
- ), #MatchType: name version: ver, e.g. mylib version: v1.2.9, or mylib.js version 1.2.8
+ ), # MatchType: name version: ver, e.g. mylib version: v1.2.9, or mylib.js version 1.2.8
re.compile(
r'\@*(version)\s?[\:|-]?\s?v?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
re.IGNORECASE
- ), #MatchType: version x.x.x, e.g. @version: 1.2.5 or version - 1.2.5 etc.
+ ), # MatchType: version x.x.x, e.g. @version: 1.2.5 or version - 1.2.5 etc.
re.compile(
r'(version)[\:|\=]\s?.?([0-9]{1,2}[\.|\-|\_][0-9.a-z_\\\\-]+).?',
re.IGNORECASE),
@@ -188,13 +186,9 @@ def check_md5_decompressed(con, file_info):
"""Check for known md5 hash (decompressed file content)."""
if con is None:
return file_info
- if file_info['dec_md5'] is None:
- return file_info
- else:
+ if file_info['dec_md5'] is not None:
libver = con.get_cdnjs_info(file_info['dec_md5'])
- if libver is None:
- return file_info
- else:
+ if libver is not None:
file_info['lib'] = libver[0]
file_info['version'] = libver[1]
file_info['lib_filename'] = libver[2]
@@ -203,7 +197,6 @@ def check_md5_decompressed(con, file_info):
else:
file_info['type'] = FileClassification.LIBRARY
file_info['detectionMethod'] = DetectionType.MD5_DECOMPRESSED
- return file_info
return file_info
@@ -361,7 +354,7 @@ def analyse_comment_known_libs(zipfile, js_file, js_info, comment):
else:
filename = js_file
for lib, regex in load_lib_identifiers().items():
- if ('filecontent' in regex):
+ if 'filecontent' in regex:
for unkregex in regex['filecontent']:
unkown_lib_matched = unkregex.finditer(comment.content)
for match in unkown_lib_matched:
@@ -481,13 +474,14 @@ def decompose_js_with_connection(path_or_zipfileobj, con):
try:
str_data = data.decode(file_info['encoding'])
except Exception:
- log_info("Exception during data decoding for entry " +
- file_info['filename'], 3)
+ log_info("Exception during data decoding for entry " + file_info['filename'], 3)
str_data = ''
else:
str_data = ''
info_data_blocks = check_data_blocks(file_info, str_data)
+ else:
+ info_data_blocks = None
if info_data_blocks:
inventory = inventory + merge_filename_and_data_info(
diff --git a/ExtensionCrawler/js_mincer.py b/ExtensionCrawler/js_mincer.py
index e93331e..12825cc 100644
--- a/ExtensionCrawler/js_mincer.py
+++ b/ExtensionCrawler/js_mincer.py
@@ -198,8 +198,8 @@ def mince_js_fileobj(fileobj):
except StopIteration:
pass
- if ((is_comment(state) and is_code_or_string_literal(suc_state)) or
- (is_code_or_string_literal(state) and is_comment(suc_state))):
+ if ((is_comment(state) and is_code_or_string_literal(suc_state)) or (
+ is_code_or_string_literal(state) and is_comment(suc_state))):
if content.strip():
yield (JsBlock(state, (block_start_line, block_start_cpos),
(line, cpos), content, string_literals))
diff --git a/cdnjs-git-miner b/cdnjs-git-miner
index ea139e2..47e5534 100755
--- a/cdnjs-git-miner
+++ b/cdnjs-git-miner
@@ -107,7 +107,7 @@ def main(argv):
logging.info("Starting update of new db libs")
pull_and_update_db(cdnjs_git_path, csv)
logging.info("Finished update of new db libs")
- if not listfile is None:
+ if listfile is not None:
logging.info("Starting update from list file")
update_db_from_listfile(cdnjs_git_path, listfile, csv)
logging.info("Finished update from list file")
diff --git a/crawler b/crawler
index fc23c20..3a39faf 100755
--- a/crawler
+++ b/crawler
@@ -19,7 +19,6 @@
A crawler for extensions from the Chrome Web Store.
"""
-import os
import sys
import datetime
import time
@@ -141,7 +140,7 @@ def log_summary(res, runtime=0):
log_info(" Total runtime: {}".format(
str(datetime.timedelta(seconds=int(runtime)))))
- if corrupt_tar_archives != []:
+ if corrupt_tar_archives:
log_info("")
log_info("List of extensions with corrupted files/archives:")
list(
@@ -229,7 +228,8 @@ def parse_args(argv):
max_discover = int(arg)
elif opt == '--pystuck':
start_pystuck = True
- return basedir, parallel, verbose, discover, max_discover, download_ext_ids_with_forums, download_ext_ids_without_forums, ext_timeout, start_pystuck
+ return [basedir, parallel, verbose, discover, max_discover, download_ext_ids_with_forums,
+ download_ext_ids_without_forums, ext_timeout, start_pystuck]
def main(argv):
@@ -242,8 +242,8 @@ def main(argv):
multiprocessing.set_start_method("forkserver")
today = datetime.datetime.now(datetime.timezone.utc).isoformat()
- basedir, parallel, verbose, discover, max_discover, download_ext_ids_with_forums, download_ext_ids_without_forums, ext_timeout, start_pystuck = parse_args(
- argv)
+ [basedir, parallel, verbose, discover, max_discover, download_ext_ids_with_forums, download_ext_ids_without_forums,
+ ext_timeout, start_pystuck] = parse_args(argv)
setup_logger(verbose)
@@ -304,7 +304,7 @@ def main(argv):
# We re-try (once) the extensions with unknown exceptions, as
# they are often temporary
has_exception = list(filter(lambda x: x.has_exception(), res))
- if has_exception != []:
+ if has_exception:
log_info(
" {} extensions with unknown exceptions, start another try ...".
format(str(len(has_exception))))
@@ -318,7 +318,7 @@ def main(argv):
res = list(set(res) - set(has_exception)) + res_update
end_time = time.time()
- log_summary(res, end_time - start_time)
+ log_summary(res, int(end_time - start_time))
log_failures_to_file(log_dir, today, res)
diff --git a/create-db b/create-db
index 3d9e88e..df993dc 100755
--- a/create-db
+++ b/create-db
@@ -17,7 +17,6 @@
#
import getopt
-import os
import sys
import tarfile
import time
@@ -30,12 +29,12 @@ import datetime
from ExtensionCrawler.archive import update_db_incremental
from ExtensionCrawler.config import *
-from ExtensionCrawler.util import log_info, log_warning, log_error, log_exception
+from ExtensionCrawler.util import log_info, log_exception
from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend
-def help():
+def print_help():
print("""create-db [OPTION]""")
print(""" -h print this help text""")
print(""" -a
archive directory""")
@@ -122,11 +121,11 @@ def parse_args(argv):
"maxtaskid=", "from-date=", "until-date=", "help"
])
except getopt.GetoptError:
- help()
+ print_help()
sys.exit(2)
for opt, arg in opts:
if opt in ("-h", "--help"):
- help()
+ print_help()
sys.exit()
elif opt in ("-a", "--archive"):
archive = arg
@@ -140,12 +139,12 @@ def parse_args(argv):
taskid = int(arg)
elif opt in ("-N", "--maxtaskid"):
maxtaskid = int(arg)
- elif opt in ("--from-date"):
+ elif opt == "--from-date":
from_date = arg
- elif opt in ("--until-date"):
+ elif opt == "--until-date":
until_date = arg
- if paths == []:
+ if not paths:
paths = list(find(archive, "*"))
chunksize = int(len(paths) / maxtaskid)
diff --git a/crx-extract b/crx-extract
index 781fd99..3b1ea23 100755
--- a/crx-extract
+++ b/crx-extract
@@ -58,7 +58,6 @@ def main(argv):
basedir = const_basedir()
verbose = True
date = None
- extid = ""
useetag = False
output = ""
winfs = False
diff --git a/crx-jsinventory b/crx-jsinventory
index 1cdce60..f7eab69 100755
--- a/crx-jsinventory
+++ b/crx-jsinventory
@@ -44,7 +44,6 @@ def main(argv):
"""Main function of the extension crawler."""
verbose = False
silent = False
- filename = None
csvfile = None
database = True
try:
diff --git a/crx-jsstrings b/crx-jsstrings
index 45cc7b2..d6a4b49 100755
--- a/crx-jsstrings
+++ b/crx-jsstrings
@@ -39,7 +39,7 @@ import jsbeautifier
from ExtensionCrawler.config import (const_log_format, const_basedir)
from ExtensionCrawler.archive import last_crx, first_crx, all_crx
-from ExtensionCrawler.config import (archive_file, get_local_archive_dir)
+from ExtensionCrawler.config import get_local_archive_dir
from ExtensionCrawler.js_decomposer import init_file_info
from ExtensionCrawler.js_mincer import mince_js
@@ -54,7 +54,7 @@ def is_file_with_c_style_comments(filename):
def jsstrings_data(conf, path, data):
"""Analyze data in memory."""
- if not conf.file_pattern is None:
+ if conf.file_pattern is not None:
if path is None:
return False
elif not fnmatch.fnmatch(path, conf.file_pattern):
@@ -98,7 +98,7 @@ def jsstrings_data(conf, path, data):
if analyze_block(conf, path, block, first):
match = True
first = False
- if match and conf.output_decoration > 0 and conf.output_decoration < 2:
+ if match and 0 < conf.output_decoration < 2:
print(path)
return match
@@ -112,6 +112,7 @@ def print_block(conf,
if conf.output_decoration > 1:
line_no = block.start[0]
prefix = " " * (block.start[1] - 1)
+ # TODO: use classifier
classifier = "X"
sep = "=" * (len(path) + 17)
if not first:
@@ -129,10 +130,10 @@ def print_block(conf,
path, loc, block.string_literals[0].rstrip())
print(line)
else:
- for (pos, str) in block.string_literals:
+ for (pos, string) in block.string_literals:
loc = '({0[0]:d}/{0[1]:d})'.format(pos)
loc = (' ' * (11 - len(loc))) + loc
- line = '{0} {1} [L]: {2}'.format(path, loc, str.rstrip())
+ line = '{0} {1} [L]: {2}'.format(path, loc, string.rstrip())
print(line)
if code_match:
print("-" * (len(path) + 17))
@@ -151,7 +152,7 @@ def analyze_block(conf, path, block, first=False):
"""Print code/comment blocks."""
match = False
regexps = []
- if not conf.reg_exp is None:
+ if conf.reg_exp is not None:
for regexp in conf.reg_exp:
if conf.case_insensitive:
regexps.append(re.compile(r'(' + regexp + ')', re.IGNORECASE))
@@ -159,7 +160,7 @@ def analyze_block(conf, path, block, first=False):
regexps.append(re.compile(r'(' + regexp + ')'))
if block.is_comment():
content = block.content
- if not conf.reg_exp_comments is None:
+ if conf.reg_exp_comments is not None:
for regexp in conf.reg_exp_comments:
if conf.case_insensitive:
regexps.append(
@@ -179,14 +180,14 @@ def analyze_block(conf, path, block, first=False):
content = block.content
regexps_string = regexps.copy()
regexps_code = regexps.copy()
- if not conf.reg_exp_string_literals is None:
+ if conf.reg_exp_string_literals is not None:
for regexp in conf.reg_exp_string_literals:
if conf.case_insensitive:
regexps.append(
re.compile(r'(' + regexp + ')', re.IGNORECASE))
else:
regexps.append(re.compile(r'(' + regexp + ')'))
- if not conf.reg_exp_source is None:
+ if conf.reg_exp_source is not None:
for regexp in conf.reg_exp_source:
if conf.case_insensitive:
regexps.append(
@@ -222,9 +223,9 @@ def analyze_block(conf, path, block, first=False):
match_idxs.add(idx)
string_match = True
block.string_literals = []
- for idx, str in enumerate(string_literals):
+ for idx, string in enumerate(string_literals):
if idx in match_idxs:
- block.string_literals.append(str)
+ block.string_literals.append(string)
code_match = False
for regexp in regexps_code:
@@ -259,9 +260,6 @@ def analyze_crx(conf, crx, path=""):
def analyze_tar(conf, tarfilename):
last_crx_file = ''
- # from_date
- # latest_date
- match = False
extid = os.path.splitext(os.path.basename(tarfilename))[0]
from_dateobj = None
latest_dateobj = None
@@ -303,7 +301,7 @@ def analyze_tar(conf, tarfilename):
# both dates are given
all_crx_files = all_crx(
os.path.join(conf.archive_dir, "data"), extid)
- if all_crx_files == []:
+ if not all_crx_files:
logging.warning("No crx in " + extid)
else:
with tarfile.open(tarfilename, 'r') as archive:
@@ -354,14 +352,13 @@ def analyze_task(conf, task):
"""Analyze one file/tar/crx/extid."""
logging.debug("Analyzing " + task)
extid_re = re.compile('^[a-p]+$')
- retval = False
if task.endswith('.crx'):
retval = analyze_crx(conf, task)
elif task.endswith('.tar'):
retval = analyze_tar(conf, task)
elif extid_re.match(task):
- tarfile = "data/" + get_local_archive_dir(task) + "/" + task + '.tar'
- retval = analyze_tar(conf, conf.archive_dir + "/" + tarfile)
+ tarfilename = "data/" + get_local_archive_dir(task) + "/" + task + '.tar'
+ retval = analyze_tar(conf, conf.archive_dir + "/" + tarfilename)
else:
retval = analyze_file(conf, task)
return retval
diff --git a/extfind b/extfind
index 9fb20c0..a76324d 100755
--- a/extfind
+++ b/extfind
@@ -21,10 +21,12 @@ import glob
import os
import sys
import logging
+import re
from ExtensionCrawler import config
-def help():
+
+def print_help():
print("""extfind [OPTION]""")
print(""" -h print this help text""")
print(""" -a archive directory""")
@@ -51,7 +53,7 @@ def iter_extension_paths_from_file(archive, n, N, extidlistfile):
if re.fullmatch("[a-p]{32}", line) and os.path.exists(path):
paths += [path]
else:
- logging.warn("WARNING: {} is not a valid extension path!".format(path))
+ logging.warning("WARNING: {} is not a valid extension path!".format(path))
return split(paths, n, N)
@@ -67,19 +69,17 @@ def main(argv):
taskid = 1
maxtaskid = 1
- paths = []
-
try:
opts, args = getopt.getopt(argv, "ha:g:e:n:N:", [
"archive=", "glob=", "extidlistfile=", "taskid=",
"maxtaskid=", "help"
])
except getopt.GetoptError:
- help()
+ print_help()
sys.exit(2)
for opt, arg in opts:
if opt in ("-h", "--help"):
- help()
+ print_help()
sys.exit()
elif opt in ("-a", "--archive"):
archive = arg
@@ -99,11 +99,12 @@ def main(argv):
elif extidglob is not None and extidlistfile is None:
paths = iter_extension_paths(archive, taskid, maxtaskid, extidglob)
else:
- help()
+ print_help()
sys.exit(2)
for path in paths:
print(path)
+
if __name__ == "__main__":
main(sys.argv[1:])
diff --git a/requirements.txt b/requirements.txt
index afb0a2d..06020a9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,10 @@
+colorama==0.3.9
+pystuck==0.8.5
simhash==1.8.0
tabulate==0.7.7
setuptools==36.2.7
cchardet==2.1.1
-mysqlclient==1.3.10
+mysqlclient==1.3.12
requests==2.18.1
pycrypto==2.6.1
beautifulsoup4==4.6.0
diff --git a/setup.py b/setup.py
index 3f4e224..2eb52fd 100644
--- a/setup.py
+++ b/setup.py
@@ -1,9 +1,12 @@
from setuptools import setup
+with open('requirements.txt') as f:
+ requirements = f.read().splitlines()
+
setup(
name='Extension Crawler',
description='A collection of utilities for downloading and analyzing browser extension from the Chrome Web store.',
author='Achim D. Brucker, Michael Herzberg',
license='GPL 3.0',
- install_requires=['GitPython', 'pebble', 'simhash', 'colorama', 'python_magic', 'tabulate', 'requests', 'pycrypto', 'beautifulsoup4', 'python_dateutil', 'mysqlclient', 'cchardet', 'jsbeautifier', 'pystuck']
+ install_requires=requirements
)