diff --git a/ExtensionCrawler/archive.py b/ExtensionCrawler/archive.py index ddea8be..1f4187d 100644 --- a/ExtensionCrawler/archive.py +++ b/ExtensionCrawler/archive.py @@ -23,7 +23,6 @@ import os import glob import re import json -from multiprocessing import Pool from concurrent.futures import TimeoutError from pebble import ProcessPool, ProcessExpired from functools import partial @@ -44,6 +43,7 @@ from ExtensionCrawler.config import ( from ExtensionCrawler.util import google_dos_protection, value_of, log_info, log_warning, log_exception, setup_logger from ExtensionCrawler.db import update_db_incremental + class Error(Exception): pass @@ -82,11 +82,11 @@ class RequestResult: class UpdateResult: - def __init__(self, id, is_new, exception, res_overview, res_crx, + def __init__(self, ext_id, is_new, exception, res_overview, res_crx, res_reviews, res_support, res_sql, sql_update, worker_exception=None): - self.id = id + self.ext_id = ext_id self.new = is_new - self.exception = exception # TODO: should be tar_exception + self.exception = exception # TODO: should be tar_exception self.res_overview = res_overview self.res_crx = res_crx self.res_reviews = res_reviews @@ -188,7 +188,7 @@ def last_modified_http_date(path): def last_crx(archivedir, extid, date=None): - last_crx = "" + last_crx_path = "" last_crx_etag = "" etag_file = os.path.join(archivedir, get_local_archive_dir(extid), @@ -198,14 +198,13 @@ def last_crx(archivedir, extid, date=None): with open(etag_file, 'r') as f: d = json.load(f) return d["last_crx"], d["last_crx_etag"] - except Exception as e: + except Exception: log_exception("Something was wrong with the etag file {}, deleting it ...".format(etag_file)) try: os.remove(etag_file) - except Exception as e: + except Exception: log_exception("Could not remove etag file {}!".format(etag_file)) - # If we do not yet have an .etag file present, open the tarfile and look # there for one. After having done that once, the crawler creates the .etag # file to avoid opening the tar file in the future. 
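Reviewer note: the hunk above tidies the `last_crx` locals and the bare `except Exception as e` handlers, and the closing comment spells out the caching scheme: a small `<extid>.etag` JSON file is consulted first, and the much larger tar archive is only opened when that file is missing or unreadable, after which the cache is rewritten. A minimal sketch of that read-through pattern, with a hypothetical `scan_tar_for_last_crx` callback standing in for the real tar-scanning code (not part of the patch):

```python
import json
import os

def cached_last_crx(etag_file, scan_tar_for_last_crx):
    """Return (last_crx_path, last_crx_etag), preferring the small JSON cache."""
    try:
        with open(etag_file, 'r') as f:
            d = json.load(f)
            return d["last_crx"], d["last_crx_etag"]
    except (OSError, ValueError, KeyError):
        # Cache missing or corrupt: drop it and fall back to the tar archive.
        try:
            os.remove(etag_file)
        except OSError:
            pass
    last_crx_path, last_crx_etag = scan_tar_for_last_crx()
    with open(etag_file, 'w') as f:
        json.dump({"last_crx": last_crx_path, "last_crx_etag": last_crx_etag}, f)
    return last_crx_path, last_crx_etag
```

In the actual patch the cache is only rewritten when no `date` filter is given, so historical look-ups never overwrite the "latest" entry.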
@@ -219,23 +218,23 @@ def last_crx(archivedir, extid, date=None): date is None or (dateutil.parser.parse( os.path.split(os.path.split(x.name)[0])[1]) <= date)) ]) - if old_crxs != []: - last_crx = old_crxs[-1] + if old_crxs: + last_crx_path = old_crxs[-1] headers_content = t.extractfile( - last_crx + ".headers").read().decode().replace( + last_crx_path + ".headers").read().decode().replace( '"', '\\"').replace("'", '"') headers_json = json.loads(headers_content) last_crx_etag = headers_json["ETag"] if date is None: with open(etag_file, 'w') as f: - json.dump({"last_crx": last_crx, "last_crx_etag": last_crx_etag}, f) + json.dump({"last_crx": last_crx_path, "last_crx_etag": last_crx_etag}, f) - return last_crx, last_crx_etag + return last_crx_path, last_crx_etag def first_crx(archivedir, extid, date=None): - first_crx = "" + first_crx_path = "" tar = os.path.join(archivedir, get_local_archive_dir(extid), extid + ".tar") if os.path.exists(tar): @@ -247,10 +246,10 @@ def first_crx(archivedir, extid, date=None): os.path.split(os.path.split(x.name)[0])[1]))) ]) t.close() - if old_crxs != []: - first_crx = old_crxs[0] + if old_crxs: + first_crx_path = old_crxs[0] - return first_crx + return first_crx_path def all_crx(archivedir, extid, date=None): @@ -283,7 +282,7 @@ def update_overview(tar, date, ext_id): def validate_crx_response(res, extid, extfilename): regex_extfilename = re.compile(r'^extension[_0-9]+\.crx$') - if not 'Content-Type' in res.headers: + if 'Content-Type' not in res.headers: raise CrawlError(extid, 'Did not find Content-Type header.', '\n'.join( res.iter_lines())) if not res.headers['Content-Type'] == 'application/x-chrome-extension': @@ -351,10 +350,12 @@ def update_crx(archivedir, tmptardir, ext_id, date): f.write(chunk) write_text(tmptardir, date, extfilename + ".etag", res.headers.get("ETag")) - etag_file = os.path.join(archivedir, get_local_archive_dir(ext_id), - ext_id + ".etag") + etag_file = os.path.join(archivedir, get_local_archive_dir(ext_id), ext_id + ".etag") with open(etag_file, 'w') as f: - json.dump({"last_crx": os.path.join(ext_id, date, extfilename), "last_crx_etag": res.headers.get("ETag")}, f) + json.dump({ + "last_crx": os.path.join(ext_id, date, extfilename), + "last_crx_etag": res.headers.get("ETag") + }, f) except Exception as e: log_exception("Exception when updating crx", 3, ext_id) write_text(tmptardir, date, extfilename + ".exception", @@ -367,9 +368,10 @@ def iterate_authors(pages): for page in pages: json_page = json.loads(page[page.index("{\""):page.rindex("}}},") + 1]) for annotation in json_page["annotations"]: - if "attributes" in annotation and "replyExists" in annotation["attributes"] and annotation["attributes"]["replyExists"]: - yield (annotation["entity"]["author"], - annotation["entity"]["groups"]) + if "attributes" in annotation: + if "replyExists" in annotation["attributes"]: + if annotation["attributes"]["replyExists"]: + yield (annotation["entity"]["author"], annotation["entity"]["groups"]) def update_reviews(tar, date, ext_id): @@ -550,7 +552,7 @@ def update_extension(archivedir, forums, ext_id): try: write_text(tardir, date, ext_id + ".sql.exception", traceback.format_exc()) - except Exception as e: + except Exception: pass try: shutil.rmtree(path=tmpdir) @@ -581,13 +583,11 @@ def init_process(verbose, start_pystuck=False): pystuck.run_server(port=((os.getpid() % 10000) + 10001)) -def execute_parallel(archivedir, max_retry, timeout, max_workers, ext_ids, forums, verbose, start_pystuck): - results=[] - with 
ProcessPool(max_workers=max_workers, max_tasks=100, initializer=init_process, initargs=(verbose, start_pystuck)) as pool: - future = pool.map(partial(update_extension, archivedir, forums), - ext_ids, - chunksize=1, - timeout=timeout) +def execute_parallel(archivedir, timeout, max_workers, ext_ids, forums, verbose, start_pystuck): + results = [] + with ProcessPool(max_workers=max_workers, max_tasks=100, initializer=init_process, + initargs=(verbose, start_pystuck)) as pool: + future = pool.map(partial(update_extension, archivedir, forums), ext_ids, chunksize=1, timeout=timeout) iterator = future.result() for ext_id in ext_ids: try: @@ -595,7 +595,7 @@ def execute_parallel(archivedir, max_retry, timeout, max_workers, ext_ids, forum except StopIteration: break except TimeoutError as error: - log_warning("WorkerException: Processing of %s took longer than %d seconds" % (ext_id,error.args[1])) + log_warning("WorkerException: Processing of %s took longer than %d seconds" % (ext_id, error.args[1])) results.append(UpdateResult(ext_id, False, None, None, None, None, None, None, None, error)) except ProcessExpired as error: log_warning("WorkerException: %s (%s), exit code: %d" % (error, ext_id, error.exitcode)) @@ -609,8 +609,6 @@ def execute_parallel(archivedir, max_retry, timeout, max_workers, ext_ids, forum def update_extensions(archivedir, parallel, forums_ext_ids, ext_ids, timeout, verbose, start_pystuck): - ext_with_forums = [] - ext_without_forums = [] forums_ext_ids = (list(set(forums_ext_ids))) log_info("Updating {} extensions ({} including forums)".format( @@ -621,13 +619,13 @@ def update_extensions(archivedir, parallel, forums_ext_ids, ext_ids, timeout, ve parallel_ids = ext_ids log_info("Updating {} extensions excluding forums (parallel)".format( len(parallel_ids)), 1) - ext_without_forums = execute_parallel(archivedir, 3, timeout, parallel, parallel_ids, False, verbose, start_pystuck) + ext_without_forums = execute_parallel(archivedir, timeout, parallel, parallel_ids, False, verbose, start_pystuck) # Second, update extensions with forums sequentially (and with delays) to # avoid running into Googles DDOS detection. 
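Reviewer note: `execute_parallel` now follows the consumption pattern documented for pebble's `ProcessPool.map`: the map returns a future whose `.result()` iterator re-raises a per-item `TimeoutError` (with the exceeded timeout in `error.args[1]`) or `ProcessExpired` when a worker process died, and the unused `multiprocessing.Pool` import plus the `max_retry` argument are dropped from the signature and both call sites. A condensed, self-contained sketch of that pattern (toy `work` function, illustrative only):

```python
from concurrent.futures import TimeoutError
from pebble import ProcessPool, ProcessExpired

def work(item):
    return item * item

def run_all(items, timeout=5):
    results = []
    with ProcessPool(max_workers=4) as pool:
        future = pool.map(work, items, timeout=timeout)
        iterator = future.result()
        for item in items:
            try:
                results.append(next(iterator))
            except StopIteration:
                break
            except TimeoutError as error:
                # error.args[1] holds the per-item timeout that was exceeded
                print("%s took longer than %d seconds" % (item, error.args[1]))
            except ProcessExpired as error:
                print("worker for %s died with exit code %d" % (item, error.exitcode))
    return results

if __name__ == "__main__":
    print(run_all([1, 2, 3]))
```

Retrying is left to the caller: as the crawler script below shows, extensions that fail with unknown exceptions are simply re-queued once.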
log_info("Updating {} extensions including forums (sequentially)".format( len(forums_ext_ids)), 1) - ext_with_forums = execute_parallel(archivedir, 3, timeout, 1, forums_ext_ids, True, verbose, start_pystuck) + ext_with_forums = execute_parallel(archivedir, timeout, 1, forums_ext_ids, True, verbose, start_pystuck) return ext_with_forums + ext_without_forums diff --git a/ExtensionCrawler/cdnjs_crawler.py b/ExtensionCrawler/cdnjs_crawler.py index ce20a8c..b5f7f5b 100644 --- a/ExtensionCrawler/cdnjs_crawler.py +++ b/ExtensionCrawler/cdnjs_crawler.py @@ -169,10 +169,10 @@ def update_lib(force, archive, lib): outphased = [] for lib_ver in local_lib_json['assets']: version = lib_ver['version'] - if not version in cdnjs_versions: + if version not in cdnjs_versions: logging.warning("Found outphased versions for " + name + " " + str(version) + " , preserving from archive.") - if not 'outphased' in lib_ver: + if 'outphased' not in lib_ver: lib_ver[ 'outphased'] = datetime.datetime.utcnow().isoformat() outphased.append(lib_ver) @@ -260,7 +260,7 @@ def delete_orphaned(archive, local_libs, cdnjs_current_libs): """Delete all orphaned local libaries.""" dirname = os.path.join(archive, "filedb", "cdnjs", "lib") for lib in local_libs: - if not lib in cdnjs_current_libs: + if lib not in cdnjs_current_libs: os.remove(os.path.join(dirname, lib + ".json")) diff --git a/ExtensionCrawler/cdnjs_git.py b/ExtensionCrawler/cdnjs_git.py index bd66ad4..138555c 100644 --- a/ExtensionCrawler/cdnjs_git.py +++ b/ExtensionCrawler/cdnjs_git.py @@ -25,8 +25,7 @@ import logging import os import re import sys -from functools import partial, reduce -from multiprocessing import Pool +from functools import reduce import dateutil.parser import git @@ -70,8 +69,8 @@ def pull_list_changed_files(git_path): for diff in single_fetch_info.commit.diff( single_fetch_info.old_commit): logging.debug("Found diff: " + str(diff)) - if not diff.a_blob is None: - if not diff.a_blob.path in files: + if diff.a_blob is not None: + if diff.a_blob.path not in files: files.append(diff.a_blob.path) return files @@ -98,7 +97,7 @@ def hackish_pull_list_changed_files(git_path): for line in pull_lines: match = re.search(r'^ (.+) \| .*$', line) - if not match is None: + if match is not None: changed_files = match.group(1).split('=>') for changed_file in changed_files: files.add(changed_file.strip()) @@ -139,6 +138,7 @@ def get_file_libinfo(release_dic, git_path, libfile): file_info['library'] = lib file_info['version'] = version file_info['add_date'] = release_dic[(lib, version)] + # TODO: why is package not used? 
package = os.path.join( reduce(os.path.join, plist[:idx + 1]), "package.json") return file_info @@ -167,7 +167,7 @@ def get_all_lib_files(cdnjs_git_path, localpath=None): libvers = set() files = [] versionidx = len(path_to_list(cdnjs_git_path)) + 4 - if not localpath is None: + if localpath is not None: paths = os.path.join(cdnjs_git_path, localpath) else: paths = os.path.join(cdnjs_git_path, 'ajax/libs/**/*') @@ -196,7 +196,7 @@ def update_database_for_file(create_csv, release_dic, cdnjs_git_path, filename, if os.path.isfile(filename): logging.info("Updating database for file " + filename) file_info = get_file_libinfo(release_dic, cdnjs_git_path, filename) - if not file_info is None: + if file_info is not None: if create_csv: print(file_info['path']) print(cdnjs_git_path) @@ -268,7 +268,7 @@ def update_database_for_file_chunked(create_csv, release_dic, cdnjs_git_path, retries = 0 success = False max_retries = 4 - while (not success and (retries < max_retries)): + while not success and (retries < max_retries): try: update_database_for_file_chunked_timeout(create_csv, release_dic, cdnjs_git_path, filenames) @@ -305,7 +305,7 @@ def get_release_triple(git_path, libver): lib = plist[-2] date = get_add_date(git_path, libver) logging.info("Release information:" + lib + " " + ver + ": " + str(date)) - return (lib, ver, date) + return lib, ver, date def build_release_date_dic(git_path, libvers): @@ -332,7 +332,6 @@ def pull_and_update_db(cdnjs_git_path, create_csv): def update_db_from_listfile(cdnjs_git_path, listfile, create_csv): """Update database (without pull) for files in listfile)""" - paths = [] with open(listfile) as listfileobj: paths = listfileobj.read().splitlines() files = [] diff --git a/ExtensionCrawler/config.py b/ExtensionCrawler/config.py index 8011092..c6b7754 100644 --- a/ExtensionCrawler/config.py +++ b/ExtensionCrawler/config.py @@ -142,10 +142,12 @@ def const_verbose(): """Default verbosity.""" return True + def const_use_process_pool(): """Use ProcessPool (from module 'pebble') for concurrency.""" return False + def const_log_format(): return '%(process)6s %(asctime)s %(levelname)8s %(message)s' @@ -154,14 +156,17 @@ def const_discover(): """Default configuration of discovery mode""" return False + def const_download_ext_ids_with_forums(): """Download extensions with forums (sequential mode)""" return True + def const_download_ext_ids_without_forums(): """Download extensions without forums (parallel mode)""" return True + def const_ext_timeout(): """Timeout for downloading an individual extension (2 hours).""" return 2*60*60 diff --git a/ExtensionCrawler/db.py b/ExtensionCrawler/db.py index a54b6e0..9c1e278 100644 --- a/ExtensionCrawler/db.py +++ b/ExtensionCrawler/db.py @@ -15,13 +15,12 @@ # along with this program. If not, see . 
# -from ExtensionCrawler.config import * -from ExtensionCrawler.util import * -from ExtensionCrawler.crx import * -from ExtensionCrawler.archive import * -from ExtensionCrawler.js_decomposer import decompose_js_with_connection, DetectionType, FileClassification +from ExtensionCrawler.config import const_mysql_config_file +from ExtensionCrawler.crx import read_crx +from ExtensionCrawler.js_decomposer import decompose_js_with_connection +from ExtensionCrawler.util import log_warning, log_debug, log_exception, log_info -from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend +from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend, convert_date import re from bs4 import BeautifulSoup @@ -63,7 +62,7 @@ def get_etag(ext_id, datepath, con): link = f.read() linked_date = link[3:].split("/")[0] - result = con.get_etag(ext_id, con.convert_date(linked_date)) + result = con.get_etag(ext_id, convert_date(linked_date)) if result is not None: return result @@ -166,7 +165,7 @@ def parse_and_insert_overview(ext_id, date, datepath, con): con.insert( "extension", extid=ext_id, - date=con.convert_date(date), + date=convert_date(date), name=name, version=version, description=description, @@ -184,12 +183,12 @@ def parse_and_insert_overview(ext_id, date, datepath, con): con.insert( "category", extid=ext_id, - date=con.convert_date(date), + date=convert_date(date), category_md5=hashlib.md5(category.encode()).digest(), category=category) -def parse_and_insert_crx(ext_id, date, datepath, con): +def parse_and_insert_crx(ext_id, datepath, con): crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None) if not crx_path: return @@ -314,7 +313,7 @@ def parse_and_insert_review(ext_id, date, reviewpath, con): con.insert( "review", extid=ext_id, - date=con.convert_date(date), + date=convert_date(date), commentdate=datetime.datetime.utcfromtimestamp( get(review, "timestamp")).isoformat() if "timestamp" in review else None, @@ -345,7 +344,7 @@ def parse_and_insert_support(ext_id, date, supportpath, con): con.insert( "support", extid=ext_id, - date=con.convert_date(date), + date=convert_date(date), commentdate=datetime.datetime.utcfromtimestamp( get(review, "timestamp")).isoformat() if "timestamp" in review else None, @@ -365,7 +364,7 @@ def parse_and_insert_replies(ext_id, date, repliespath, con): log_debug("- parsing reply file", 3, ext_id) with open(repliespath) as f: d = json.load(f) - if not "searchResults" in d: + if "searchResults" not in d: log_warning("* WARNING: there are no search results in {}".format( repliespath), 3, ext_id) return @@ -379,7 +378,7 @@ def parse_and_insert_replies(ext_id, date, repliespath, con): con.insert( "reply", extid=ext_id, - date=con.convert_date(date), + date=convert_date(date), commentdate=datetime.datetime.utcfromtimestamp( get(annotation, "timestamp")).isoformat() if "timestamp" in annotation else None, @@ -413,7 +412,7 @@ def parse_and_insert_status(ext_id, date, datepath, con): con.insert( "status", extid=ext_id, - date=con.convert_date(date), + date=convert_date(date), crx_status=crx_status, overview_status=overview_status, overview_exception=overview_exception) @@ -439,8 +438,8 @@ def update_db_incremental_with_connection(tmptardir, ext_id, date, con): if etag: try: - parse_and_insert_crx(ext_id, date, datepath, con) - except Exception as e: + parse_and_insert_crx(ext_id, datepath, con) + except Exception: log_exception("Exception when parsing crx", 3, ext_id) else: crx_status = get_crx_status(datepath) @@ -449,40 +448,40 @@ def 
update_db_incremental_with_connection(tmptardir, ext_id, date, con): try: parse_and_insert_overview(ext_id, date, datepath, con) - except Exception as e: + except Exception: log_exception("Exception when parsing overview", 3, ext_id) try: parse_and_insert_status(ext_id, date, datepath, con) - except Exception as e: + except Exception: log_exception("Exception when parsing status", 3, ext_id) reviewpaths = glob.glob(os.path.join(datepath, "reviews*-*.text")) for reviewpath in reviewpaths: try: parse_and_insert_review(ext_id, date, reviewpath, con) - except json.decoder.JSONDecodeError as e: + except json.decoder.JSONDecodeError: log_warning("- WARNING: Review is not a proper json file!", 3, ext_id) - except Exception as e: + except Exception: log_exception("Exception when parsing review", 3, ext_id) supportpaths = glob.glob(os.path.join(datepath, "support*-*.text")) for supportpath in supportpaths: try: parse_and_insert_support(ext_id, date, supportpath, con) - except json.decoder.JSONDecodeError as e: + except json.decoder.JSONDecodeError: log_warning("- WARNING: Support is not a proper json file!", 3, ext_id) - except Exception as e: + except Exception: log_exception("Exception when parsing support", 3, ext_id) repliespaths = glob.glob(os.path.join(datepath, "*replies.text")) for repliespath in repliespaths: try: parse_and_insert_replies(ext_id, date, repliespath, con) - except json.decoder.JSONDecodeError as e: + except json.decoder.JSONDecodeError: log_warning("- WARNING: Reply is not a proper json file!", 3, ext_id) - except Exception as e: + except Exception: log_exception("Exception when parsing reply", 3, ext_id) diff --git a/ExtensionCrawler/dbbackend/mysql_backend.py b/ExtensionCrawler/dbbackend/mysql_backend.py index 5a55c4f..fb0b39a 100644 --- a/ExtensionCrawler/dbbackend/mysql_backend.py +++ b/ExtensionCrawler/dbbackend/mysql_backend.py @@ -18,18 +18,17 @@ import time import datetime from random import uniform -from itertools import starmap -import logging import MySQLdb import _mysql_exceptions import ExtensionCrawler.config as config -from ExtensionCrawler.util import log_info, log_error, log_exception, log_warning +from ExtensionCrawler.util import log_info, log_error, log_warning class MysqlBackend: - def __init__(self, ext_id, try_wait=config.const_mysql_try_wait(), maxtries=config.const_mysql_maxtries(), **kwargs): + def __init__(self, ext_id, try_wait=config.const_mysql_try_wait(), maxtries=config.const_mysql_maxtries(), + **kwargs): self.ext_id = ext_id self.dbargs = kwargs self.try_wait = try_wait @@ -147,5 +146,6 @@ class MysqlBackend: result = self.retry(lambda: self.cursor.fetchone()) return result - def convert_date(self, date): - return date[:-6] + +def convert_date(date): + return date[:-6] diff --git a/ExtensionCrawler/discover.py b/ExtensionCrawler/discover.py index af51ba6..3ef9e86 100644 --- a/ExtensionCrawler/discover.py +++ b/ExtensionCrawler/discover.py @@ -17,12 +17,10 @@ """Python mnodule providing methods for discovering extensions in the Chrome extension store.""" -import xml.etree.ElementTree as ET +from xml.etree.ElementTree import fromstring import re -from functools import reduce import requests from ExtensionCrawler import config -from ExtensionCrawler.util import log_info, log_exception def crawl_nearly_all_of_ext_ids(): @@ -30,7 +28,7 @@ def crawl_nearly_all_of_ext_ids(): def get_inner_elems(doc): """Get inner element.""" - return ET.fromstring(doc).iterfind(r".//{{{}}}loc".format( + return fromstring(doc).iterfind(r".//{{{}}}loc".format( 
config.const_sitemap_scheme())) def is_generic_url(url): diff --git a/ExtensionCrawler/file_identifiers.py b/ExtensionCrawler/file_identifiers.py index c12a5b8..31e0262 100644 --- a/ExtensionCrawler/file_identifiers.py +++ b/ExtensionCrawler/file_identifiers.py @@ -30,12 +30,14 @@ import magic from ExtensionCrawler.js_mincer import mince_js + def is_binary_resource(mimetype_magic): return (mimetype_magic.startswith("image/") or mimetype_magic.startswith("video/") or mimetype_magic.startswith("audio/") or mimetype_magic == "application/pdf") + def normalize_jsdata(str_data): """Compute normalized code blocks of a JavaScript file""" txt = "" @@ -59,9 +61,8 @@ def get_features(s): def get_simhash(encoding, data): """Compute simhash of text.""" - str_data = "" - if not encoding is None: - str_data = data.decode(encoding=encoding,errors="replace") + if encoding is not None: + str_data = data.decode(encoding=encoding, errors="replace") else: str_data = str(data) simhash = Simhash(get_features(str_data)).value @@ -82,31 +83,30 @@ def compute_difference(hx, hy): def get_data_identifiers(data): """Get basic data identifiers (size, hashes, normalized hashes, etc.).""" - data_identifier = {} - - data_identifier['encoding'] = None - data_identifier['description'] = None - data_identifier['size'] = None - data_identifier['loc'] = None - data_identifier['mimetype_magic'] = None - data_identifier['md5'] = None - data_identifier['sha1'] = None - data_identifier['sha256'] = None - data_identifier['simhash'] = None - data_identifier['size_stripped'] = None - data_identifier['normalized_encoding'] = None - data_identifier['normalized_description'] = None - data_identifier['normalized_size'] = None - data_identifier['normalized_loc'] = None - data_identifier['normalized_mimetype_magic'] = None - data_identifier['normalized_md5'] = None - data_identifier['normalized_sha1'] = None - data_identifier['normalized_sha256'] = None - data_identifier['normalized_simhash'] = None + data_identifier = { + 'encoding': None, + 'description': None, + 'size': None, + 'loc': None, + 'mimetype_magic': None, + 'md5': None, + 'sha1': None, + 'sha256': None, + 'simhash': None, + 'size_stripped': None, + 'normalized_encoding': None, + 'normalized_description': None, + 'normalized_size': None, + 'normalized_loc': None, + 'normalized_mimetype_magic': None, + 'normalized_md5': None, + 'normalized_sha1': None, + 'normalized_sha256': None, + 'normalized_simhash': None + } mimetype_magic = magic.from_buffer(data, mime=True) - magic_desc = "" try: magic_desc = magic.from_buffer(data) except magic.MagicException as exp: @@ -137,9 +137,10 @@ def get_data_identifiers(data): data_identifier['encoding'] = encoding try: normalized_data, normalized_loc = normalize_jsdata( - data.decode(encoding=data_identifier['encoding'],errors="replace")) + data.decode(encoding=data_identifier['encoding'], errors="replace")) except Exception: normalized_data = None + normalized_loc = 0 if normalized_data is not None: normalized_magic_desc = "" @@ -149,7 +150,7 @@ def get_data_identifiers(data): rgx = re.compile(r' name use count.*$') msg = str(exp.message) if re.search(rgx, msg): - magic_desc = re.sub(rgx, '', msg) + normalized_magic_desc = re.sub(rgx, '', msg) else: raise exp normalized_encoding = chardet.detect(normalized_data)['encoding'] diff --git a/ExtensionCrawler/js_decomposer.py b/ExtensionCrawler/js_decomposer.py index 7292874..62e5b2e 100644 --- a/ExtensionCrawler/js_decomposer.py +++ b/ExtensionCrawler/js_decomposer.py @@ -18,12 +18,10 @@ general 
and Chrome extensions in particular.""" import os -import io from io import StringIO import re import json import zlib -import logging from enum import Enum from ExtensionCrawler.js_mincer import mince_js from ExtensionCrawler.file_identifiers import get_file_identifiers, is_binary_resource @@ -107,15 +105,15 @@ def unknown_lib_identifiers(): re.compile( r'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\sv?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)', re.IGNORECASE - ), #MatchType: name version, e.g. mylib v1.2.9b or mylib.anything 1.2.8 + ), # MatchType: name version, e.g. mylib v1.2.9b or mylib.anything 1.2.8 re.compile( r'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\s(?: version)\:?\s?v?([0-9][0-9.a-z_\\\\-]+)', re.IGNORECASE - ), #MatchType: name version: ver, e.g. mylib version: v1.2.9, or mylib.js version 1.2.8 + ), # MatchType: name version: ver, e.g. mylib version: v1.2.9, or mylib.js version 1.2.8 re.compile( r'\@*(version)\s?[\:|-]?\s?v?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)', re.IGNORECASE - ), #MatchType: version x.x.x, e.g. @version: 1.2.5 or version - 1.2.5 etc. + ), # MatchType: version x.x.x, e.g. @version: 1.2.5 or version - 1.2.5 etc. re.compile( r'(version)[\:|\=]\s?.?([0-9]{1,2}[\.|\-|\_][0-9.a-z_\\\\-]+).?', re.IGNORECASE), @@ -188,13 +186,9 @@ def check_md5_decompressed(con, file_info): """Check for known md5 hash (decompressed file content).""" if con is None: return file_info - if file_info['dec_md5'] is None: - return file_info - else: + if file_info['dec_md5'] is not None: libver = con.get_cdnjs_info(file_info['dec_md5']) - if libver is None: - return file_info - else: + if libver is not None: file_info['lib'] = libver[0] file_info['version'] = libver[1] file_info['lib_filename'] = libver[2] @@ -203,7 +197,6 @@ def check_md5_decompressed(con, file_info): else: file_info['type'] = FileClassification.LIBRARY file_info['detectionMethod'] = DetectionType.MD5_DECOMPRESSED - return file_info return file_info @@ -361,7 +354,7 @@ def analyse_comment_known_libs(zipfile, js_file, js_info, comment): else: filename = js_file for lib, regex in load_lib_identifiers().items(): - if ('filecontent' in regex): + if 'filecontent' in regex: for unkregex in regex['filecontent']: unkown_lib_matched = unkregex.finditer(comment.content) for match in unkown_lib_matched: @@ -481,13 +474,14 @@ def decompose_js_with_connection(path_or_zipfileobj, con): try: str_data = data.decode(file_info['encoding']) except Exception: - log_info("Exception during data decoding for entry " + - file_info['filename'], 3) + log_info("Exception during data decoding for entry " + file_info['filename'], 3) str_data = '' else: str_data = '' info_data_blocks = check_data_blocks(file_info, str_data) + else: + info_data_blocks = None if info_data_blocks: inventory = inventory + merge_filename_and_data_info( diff --git a/ExtensionCrawler/js_mincer.py b/ExtensionCrawler/js_mincer.py index e93331e..12825cc 100644 --- a/ExtensionCrawler/js_mincer.py +++ b/ExtensionCrawler/js_mincer.py @@ -198,8 +198,8 @@ def mince_js_fileobj(fileobj): except StopIteration: pass - if ((is_comment(state) and is_code_or_string_literal(suc_state)) or - (is_code_or_string_literal(state) and is_comment(suc_state))): + if ((is_comment(state) and is_code_or_string_literal(suc_state)) or ( + is_code_or_string_literal(state) and is_comment(suc_state))): if content.strip(): yield (JsBlock(state, (block_start_line, block_start_cpos), (line, cpos), content, string_literals)) diff --git a/cdnjs-git-miner b/cdnjs-git-miner index ea139e2..47e5534 100755 --- a/cdnjs-git-miner +++ 
b/cdnjs-git-miner @@ -107,7 +107,7 @@ def main(argv): logging.info("Starting update of new db libs") pull_and_update_db(cdnjs_git_path, csv) logging.info("Finished update of new db libs") - if not listfile is None: + if listfile is not None: logging.info("Starting update from list file") update_db_from_listfile(cdnjs_git_path, listfile, csv) logging.info("Finished update from list file") diff --git a/crawler b/crawler index fc23c20..3a39faf 100755 --- a/crawler +++ b/crawler @@ -19,7 +19,6 @@ A crawler for extensions from the Chrome Web Store. """ -import os import sys import datetime import time @@ -141,7 +140,7 @@ def log_summary(res, runtime=0): log_info(" Total runtime: {}".format( str(datetime.timedelta(seconds=int(runtime))))) - if corrupt_tar_archives != []: + if corrupt_tar_archives: log_info("") log_info("List of extensions with corrupted files/archives:") list( @@ -229,7 +228,8 @@ def parse_args(argv): max_discover = int(arg) elif opt == '--pystuck': start_pystuck = True - return basedir, parallel, verbose, discover, max_discover, download_ext_ids_with_forums, download_ext_ids_without_forums, ext_timeout, start_pystuck + return [basedir, parallel, verbose, discover, max_discover, download_ext_ids_with_forums, + download_ext_ids_without_forums, ext_timeout, start_pystuck] def main(argv): @@ -242,8 +242,8 @@ def main(argv): multiprocessing.set_start_method("forkserver") today = datetime.datetime.now(datetime.timezone.utc).isoformat() - basedir, parallel, verbose, discover, max_discover, download_ext_ids_with_forums, download_ext_ids_without_forums, ext_timeout, start_pystuck = parse_args( - argv) + [basedir, parallel, verbose, discover, max_discover, download_ext_ids_with_forums, download_ext_ids_without_forums, + ext_timeout, start_pystuck] = parse_args(argv) setup_logger(verbose) @@ -304,7 +304,7 @@ def main(argv): # We re-try (once) the extensions with unknown exceptions, as # they are often temporary has_exception = list(filter(lambda x: x.has_exception(), res)) - if has_exception != []: + if has_exception: log_info( " {} extensions with unknown exceptions, start another try ...". 
format(str(len(has_exception)))) @@ -318,7 +318,7 @@ def main(argv): res = list(set(res) - set(has_exception)) + res_update end_time = time.time() - log_summary(res, end_time - start_time) + log_summary(res, int(end_time - start_time)) log_failures_to_file(log_dir, today, res) diff --git a/create-db b/create-db index 3d9e88e..df993dc 100755 --- a/create-db +++ b/create-db @@ -17,7 +17,6 @@ # import getopt -import os import sys import tarfile import time @@ -30,12 +29,12 @@ import datetime from ExtensionCrawler.archive import update_db_incremental from ExtensionCrawler.config import * -from ExtensionCrawler.util import log_info, log_warning, log_error, log_exception +from ExtensionCrawler.util import log_info, log_exception from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend -def help(): +def print_help(): print("""create-db [OPTION]""") print(""" -h print this help text""") print(""" -a archive directory""") @@ -122,11 +121,11 @@ def parse_args(argv): "maxtaskid=", "from-date=", "until-date=", "help" ]) except getopt.GetoptError: - help() + print_help() sys.exit(2) for opt, arg in opts: if opt in ("-h", "--help"): - help() + print_help() sys.exit() elif opt in ("-a", "--archive"): archive = arg @@ -140,12 +139,12 @@ def parse_args(argv): taskid = int(arg) elif opt in ("-N", "--maxtaskid"): maxtaskid = int(arg) - elif opt in ("--from-date"): + elif opt == "--from-date": from_date = arg - elif opt in ("--until-date"): + elif opt == "--until-date": until_date = arg - if paths == []: + if not paths: paths = list(find(archive, "*")) chunksize = int(len(paths) / maxtaskid) diff --git a/crx-extract b/crx-extract index 781fd99..3b1ea23 100755 --- a/crx-extract +++ b/crx-extract @@ -58,7 +58,6 @@ def main(argv): basedir = const_basedir() verbose = True date = None - extid = "" useetag = False output = "" winfs = False diff --git a/crx-jsinventory b/crx-jsinventory index 1cdce60..f7eab69 100755 --- a/crx-jsinventory +++ b/crx-jsinventory @@ -44,7 +44,6 @@ def main(argv): """Main function of the extension crawler.""" verbose = False silent = False - filename = None csvfile = None database = True try: diff --git a/crx-jsstrings b/crx-jsstrings index 45cc7b2..d6a4b49 100755 --- a/crx-jsstrings +++ b/crx-jsstrings @@ -39,7 +39,7 @@ import jsbeautifier from ExtensionCrawler.config import (const_log_format, const_basedir) from ExtensionCrawler.archive import last_crx, first_crx, all_crx -from ExtensionCrawler.config import (archive_file, get_local_archive_dir) +from ExtensionCrawler.config import get_local_archive_dir from ExtensionCrawler.js_decomposer import init_file_info from ExtensionCrawler.js_mincer import mince_js @@ -54,7 +54,7 @@ def is_file_with_c_style_comments(filename): def jsstrings_data(conf, path, data): """Analyze data in memory.""" - if not conf.file_pattern is None: + if conf.file_pattern is not None: if path is None: return False elif not fnmatch.fnmatch(path, conf.file_pattern): @@ -98,7 +98,7 @@ def jsstrings_data(conf, path, data): if analyze_block(conf, path, block, first): match = True first = False - if match and conf.output_decoration > 0 and conf.output_decoration < 2: + if match and 0 < conf.output_decoration < 2: print(path) return match @@ -112,6 +112,7 @@ def print_block(conf, if conf.output_decoration > 1: line_no = block.start[0] prefix = " " * (block.start[1] - 1) + # TODO: use classifier classifier = "X" sep = "=" * (len(path) + 17) if not first: @@ -129,10 +130,10 @@ def print_block(conf, path, loc, block.string_literals[0].rstrip()) print(line) else: - 
for (pos, str) in block.string_literals: + for (pos, string) in block.string_literals: loc = '({0[0]:d}/{0[1]:d})'.format(pos) loc = (' ' * (11 - len(loc))) + loc - line = '{0} {1} [L]: {2}'.format(path, loc, str.rstrip()) + line = '{0} {1} [L]: {2}'.format(path, loc, string.rstrip()) print(line) if code_match: print("-" * (len(path) + 17)) @@ -151,7 +152,7 @@ def analyze_block(conf, path, block, first=False): """Print code/comment blocks.""" match = False regexps = [] - if not conf.reg_exp is None: + if conf.reg_exp is not None: for regexp in conf.reg_exp: if conf.case_insensitive: regexps.append(re.compile(r'(' + regexp + ')', re.IGNORECASE)) @@ -159,7 +160,7 @@ def analyze_block(conf, path, block, first=False): regexps.append(re.compile(r'(' + regexp + ')')) if block.is_comment(): content = block.content - if not conf.reg_exp_comments is None: + if conf.reg_exp_comments is not None: for regexp in conf.reg_exp_comments: if conf.case_insensitive: regexps.append( @@ -179,14 +180,14 @@ def analyze_block(conf, path, block, first=False): content = block.content regexps_string = regexps.copy() regexps_code = regexps.copy() - if not conf.reg_exp_string_literals is None: + if conf.reg_exp_string_literals is not None: for regexp in conf.reg_exp_string_literals: if conf.case_insensitive: regexps.append( re.compile(r'(' + regexp + ')', re.IGNORECASE)) else: regexps.append(re.compile(r'(' + regexp + ')')) - if not conf.reg_exp_source is None: + if conf.reg_exp_source is not None: for regexp in conf.reg_exp_source: if conf.case_insensitive: regexps.append( @@ -222,9 +223,9 @@ def analyze_block(conf, path, block, first=False): match_idxs.add(idx) string_match = True block.string_literals = [] - for idx, str in enumerate(string_literals): + for idx, string in enumerate(string_literals): if idx in match_idxs: - block.string_literals.append(str) + block.string_literals.append(string) code_match = False for regexp in regexps_code: @@ -259,9 +260,6 @@ def analyze_crx(conf, crx, path=""): def analyze_tar(conf, tarfilename): last_crx_file = '' - # from_date - # latest_date - match = False extid = os.path.splitext(os.path.basename(tarfilename))[0] from_dateobj = None latest_dateobj = None @@ -303,7 +301,7 @@ def analyze_tar(conf, tarfilename): # both dates are given all_crx_files = all_crx( os.path.join(conf.archive_dir, "data"), extid) - if all_crx_files == []: + if not all_crx_files: logging.warning("No crx in " + extid) else: with tarfile.open(tarfilename, 'r') as archive: @@ -354,14 +352,13 @@ def analyze_task(conf, task): """Analyze one file/tar/crx/extid.""" logging.debug("Analyzing " + task) extid_re = re.compile('^[a-p]+$') - retval = False if task.endswith('.crx'): retval = analyze_crx(conf, task) elif task.endswith('.tar'): retval = analyze_tar(conf, task) elif extid_re.match(task): - tarfile = "data/" + get_local_archive_dir(task) + "/" + task + '.tar' - retval = analyze_tar(conf, conf.archive_dir + "/" + tarfile) + tarfilename = "data/" + get_local_archive_dir(task) + "/" + task + '.tar' + retval = analyze_tar(conf, conf.archive_dir + "/" + tarfilename) else: retval = analyze_file(conf, task) return retval diff --git a/extfind b/extfind index 9fb20c0..a76324d 100755 --- a/extfind +++ b/extfind @@ -21,10 +21,12 @@ import glob import os import sys import logging +import re from ExtensionCrawler import config -def help(): + +def print_help(): print("""extfind [OPTION]""") print(""" -h print this help text""") print(""" -a archive directory""") @@ -51,7 +53,7 @@ def 
iter_extension_paths_from_file(archive, n, N, extidlistfile): if re.fullmatch("[a-p]{32}", line) and os.path.exists(path): paths += [path] else: - logging.warn("WARNING: {} is not a valid extension path!".format(path)) + logging.warning("WARNING: {} is not a valid extension path!".format(path)) return split(paths, n, N) @@ -67,19 +69,17 @@ def main(argv): taskid = 1 maxtaskid = 1 - paths = [] - try: opts, args = getopt.getopt(argv, "ha:g:e:n:N:", [ "archive=", "glob=", "extidlistfile=", "taskid=", "maxtaskid=", "help" ]) except getopt.GetoptError: - help() + print_help() sys.exit(2) for opt, arg in opts: if opt in ("-h", "--help"): - help() + print_help() sys.exit() elif opt in ("-a", "--archive"): archive = arg @@ -99,11 +99,12 @@ def main(argv): elif extidglob is not None and extidlistfile is None: paths = iter_extension_paths(archive, taskid, maxtaskid, extidglob) else: - help() + print_help() sys.exit(2) for path in paths: print(path) + if __name__ == "__main__": main(sys.argv[1:]) diff --git a/requirements.txt b/requirements.txt index afb0a2d..06020a9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,10 @@ +colorama==0.3.9 +pystuck==0.8.5 simhash==1.8.0 tabulate==0.7.7 setuptools==36.2.7 cchardet==2.1.1 -mysqlclient==1.3.10 +mysqlclient==1.3.12 requests==2.18.1 pycrypto==2.6.1 beautifulsoup4==4.6.0 diff --git a/setup.py b/setup.py index 3f4e224..2eb52fd 100644 --- a/setup.py +++ b/setup.py @@ -1,9 +1,12 @@ from setuptools import setup +with open('requirements.txt') as f: + requirements = f.read().splitlines() + setup( name='Extension Crawler', description='A collection of utilities for downloading and analyzing browser extension from the Chrome Web store.', author='Achim D. Brucker, Michael Herzberg', license='GPL 3.0', - install_requires=['GitPython', 'pebble', 'simhash', 'colorama', 'python_magic', 'tabulate', 'requests', 'pycrypto', 'beautifulsoup4', 'python_dateutil', 'mysqlclient', 'cchardet', 'jsbeautifier', 'pystuck'] + install_requires=requirements )
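Reviewer note: the setup.py change keeps the pinned dependency list in one place by feeding requirements.txt straight into `install_requires`. The patch reads the file verbatim, which works because requirements.txt currently holds only plain `name==version` pins; the sketch below is a slightly more defensive variant of the same pattern that skips blank lines and comments (an assumption, not something the patch does):

```python
from setuptools import setup

with open('requirements.txt') as f:
    # Keep only plain requirement pins; skip blank lines and comments.
    requirements = [line.strip() for line in f
                    if line.strip() and not line.strip().startswith('#')]

setup(
    name='Extension Crawler',
    description='A collection of utilities for downloading and analyzing '
                'browser extensions from the Chrome Web Store.',
    author='Achim D. Brucker, Michael Herzberg',
    license='GPL 3.0',
    install_requires=requirements,
)
```

Note that pip-only syntax such as `-r` includes or editable installs would not be valid `install_requires` entries, so the single-file approach only holds as long as requirements.txt stays a flat list of pins.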