Fixed style errors and warnings.

Michael Herzberg 2018-04-21 19:00:07 +01:00
parent ac3c1c7f20
commit a789fe505f
19 changed files with 168 additions and 174 deletions

View File

@ -23,7 +23,6 @@ import os
import glob
import re
import json
from multiprocessing import Pool
from concurrent.futures import TimeoutError
from pebble import ProcessPool, ProcessExpired
from functools import partial
@ -44,6 +43,7 @@ from ExtensionCrawler.config import (
from ExtensionCrawler.util import google_dos_protection, value_of, log_info, log_warning, log_exception, setup_logger
from ExtensionCrawler.db import update_db_incremental
class Error(Exception):
pass
@ -82,11 +82,11 @@ class RequestResult:
class UpdateResult:
def __init__(self, id, is_new, exception, res_overview, res_crx,
def __init__(self, ext_id, is_new, exception, res_overview, res_crx,
res_reviews, res_support, res_sql, sql_update, worker_exception=None):
self.id = id
self.ext_id = ext_id
self.new = is_new
self.exception = exception # TODO: should be tar_exception
self.exception = exception # TODO: should be tar_exception
self.res_overview = res_overview
self.res_crx = res_crx
self.res_reviews = res_reviews
@ -188,7 +188,7 @@ def last_modified_http_date(path):
def last_crx(archivedir, extid, date=None):
last_crx = ""
last_crx_path = ""
last_crx_etag = ""
etag_file = os.path.join(archivedir, get_local_archive_dir(extid),
@ -198,14 +198,13 @@ def last_crx(archivedir, extid, date=None):
with open(etag_file, 'r') as f:
d = json.load(f)
return d["last_crx"], d["last_crx_etag"]
except Exception as e:
except Exception:
log_exception("Something was wrong with the etag file {}, deleting it ...".format(etag_file))
try:
os.remove(etag_file)
except Exception as e:
except Exception:
log_exception("Could not remove etag file {}!".format(etag_file))
# If we do not yet have an .etag file present, open the tarfile and look
# there for one. After having done that once, the crawler creates the .etag
# file to avoid opening the tar file in the future.
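The comment above describes a simple read-through cache: the per-extension .etag file is consulted first, and the tar archive is only scanned when that file is missing or unreadable, after which the result is written back for the next run. A minimal sketch of that pattern, with the expensive tar scan reduced to a placeholder callback rather than the crawler's real helper (the real code follows in the next hunk):

import json

def cached_last_crx(etag_file, scan_tar):
    """Return (crx_path, etag); scan_tar() is the expensive fallback."""
    try:
        with open(etag_file) as f:
            d = json.load(f)
            return d["last_crx"], d["last_crx_etag"]
    except (OSError, ValueError, KeyError):
        pass  # cache file missing or corrupt: fall back to the tar archive
    crx_path, etag = scan_tar()
    with open(etag_file, "w") as f:  # write the cache for subsequent runs
        json.dump({"last_crx": crx_path, "last_crx_etag": etag}, f)
    return crx_path, etag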
@ -219,23 +218,23 @@ def last_crx(archivedir, extid, date=None):
date is None or (dateutil.parser.parse(
os.path.split(os.path.split(x.name)[0])[1]) <= date))
])
if old_crxs != []:
last_crx = old_crxs[-1]
if old_crxs:
last_crx_path = old_crxs[-1]
headers_content = t.extractfile(
last_crx + ".headers").read().decode().replace(
last_crx_path + ".headers").read().decode().replace(
'"', '\\"').replace("'", '"')
headers_json = json.loads(headers_content)
last_crx_etag = headers_json["ETag"]
if date is None:
with open(etag_file, 'w') as f:
json.dump({"last_crx": last_crx, "last_crx_etag": last_crx_etag}, f)
json.dump({"last_crx": last_crx_path, "last_crx_etag": last_crx_etag}, f)
return last_crx, last_crx_etag
return last_crx_path, last_crx_etag
def first_crx(archivedir, extid, date=None):
first_crx = ""
first_crx_path = ""
tar = os.path.join(archivedir, get_local_archive_dir(extid),
extid + ".tar")
if os.path.exists(tar):
@ -247,10 +246,10 @@ def first_crx(archivedir, extid, date=None):
os.path.split(os.path.split(x.name)[0])[1])))
])
t.close()
if old_crxs != []:
first_crx = old_crxs[0]
if old_crxs:
first_crx_path = old_crxs[0]
return first_crx
return first_crx_path
def all_crx(archivedir, extid, date=None):
@ -283,7 +282,7 @@ def update_overview(tar, date, ext_id):
def validate_crx_response(res, extid, extfilename):
regex_extfilename = re.compile(r'^extension[_0-9]+\.crx$')
if not 'Content-Type' in res.headers:
if 'Content-Type' not in res.headers:
raise CrawlError(extid, 'Did not find Content-Type header.', '\n'.join(
res.iter_lines()))
if not res.headers['Content-Type'] == 'application/x-chrome-extension':
@ -351,10 +350,12 @@ def update_crx(archivedir, tmptardir, ext_id, date):
f.write(chunk)
write_text(tmptardir, date, extfilename + ".etag",
res.headers.get("ETag"))
etag_file = os.path.join(archivedir, get_local_archive_dir(ext_id),
ext_id + ".etag")
etag_file = os.path.join(archivedir, get_local_archive_dir(ext_id), ext_id + ".etag")
with open(etag_file, 'w') as f:
json.dump({"last_crx": os.path.join(ext_id, date, extfilename), "last_crx_etag": res.headers.get("ETag")}, f)
json.dump({
"last_crx": os.path.join(ext_id, date, extfilename),
"last_crx_etag": res.headers.get("ETag")
}, f)
except Exception as e:
log_exception("Exception when updating crx", 3, ext_id)
write_text(tmptardir, date, extfilename + ".exception",
@ -367,9 +368,10 @@ def iterate_authors(pages):
for page in pages:
json_page = json.loads(page[page.index("{\""):page.rindex("}}},") + 1])
for annotation in json_page["annotations"]:
if "attributes" in annotation and "replyExists" in annotation["attributes"] and annotation["attributes"]["replyExists"]:
yield (annotation["entity"]["author"],
annotation["entity"]["groups"])
if "attributes" in annotation:
if "replyExists" in annotation["attributes"]:
if annotation["attributes"]["replyExists"]:
yield (annotation["entity"]["author"], annotation["entity"]["groups"])
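For readers following the hunk above: the nested membership checks guard against missing keys before testing replyExists. An equivalent single-expression form, shown purely for illustration (it is not part of this commit), uses dict.get with an empty default:

if annotation.get("attributes", {}).get("replyExists"):
    yield (annotation["entity"]["author"], annotation["entity"]["groups"])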
def update_reviews(tar, date, ext_id):
@ -550,7 +552,7 @@ def update_extension(archivedir, forums, ext_id):
try:
write_text(tardir, date, ext_id + ".sql.exception",
traceback.format_exc())
except Exception as e:
except Exception:
pass
try:
shutil.rmtree(path=tmpdir)
@ -581,13 +583,11 @@ def init_process(verbose, start_pystuck=False):
pystuck.run_server(port=((os.getpid() % 10000) + 10001))
def execute_parallel(archivedir, max_retry, timeout, max_workers, ext_ids, forums, verbose, start_pystuck):
results=[]
with ProcessPool(max_workers=max_workers, max_tasks=100, initializer=init_process, initargs=(verbose, start_pystuck)) as pool:
future = pool.map(partial(update_extension, archivedir, forums),
ext_ids,
chunksize=1,
timeout=timeout)
def execute_parallel(archivedir, timeout, max_workers, ext_ids, forums, verbose, start_pystuck):
results = []
with ProcessPool(max_workers=max_workers, max_tasks=100, initializer=init_process,
initargs=(verbose, start_pystuck)) as pool:
future = pool.map(partial(update_extension, archivedir, forums), ext_ids, chunksize=1, timeout=timeout)
iterator = future.result()
for ext_id in ext_ids:
try:
@ -595,7 +595,7 @@ def execute_parallel(archivedir, max_retry, timeout, max_workers, ext_ids, forum
except StopIteration:
break
except TimeoutError as error:
log_warning("WorkerException: Processing of %s took longer than %d seconds" % (ext_id,error.args[1]))
log_warning("WorkerException: Processing of %s took longer than %d seconds" % (ext_id, error.args[1]))
results.append(UpdateResult(ext_id, False, None, None, None, None, None, None, None, error))
except ProcessExpired as error:
log_warning("WorkerException: %s (%s), exit code: %d" % (error, ext_id, error.exitcode))
@ -609,8 +609,6 @@ def execute_parallel(archivedir, max_retry, timeout, max_workers, ext_ids, forum
def update_extensions(archivedir, parallel, forums_ext_ids, ext_ids, timeout, verbose, start_pystuck):
ext_with_forums = []
ext_without_forums = []
forums_ext_ids = (list(set(forums_ext_ids)))
log_info("Updating {} extensions ({} including forums)".format(
@ -621,13 +619,13 @@ def update_extensions(archivedir, parallel, forums_ext_ids, ext_ids, timeout, ve
parallel_ids = ext_ids
log_info("Updating {} extensions excluding forums (parallel)".format(
len(parallel_ids)), 1)
ext_without_forums = execute_parallel(archivedir, 3, timeout, parallel, parallel_ids, False, verbose, start_pystuck)
ext_without_forums = execute_parallel(archivedir, timeout, parallel, parallel_ids, False, verbose, start_pystuck)
# Second, update extensions with forums sequentially (and with delays) to
# avoid running into Google's DDOS detection.
log_info("Updating {} extensions including forums (sequentially)".format(
len(forums_ext_ids)), 1)
ext_with_forums = execute_parallel(archivedir, 3, timeout, 1, forums_ext_ids, True, verbose, start_pystuck)
ext_with_forums = execute_parallel(archivedir, timeout, 1, forums_ext_ids, True, verbose, start_pystuck)
return ext_with_forums + ext_without_forums
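The hunks above rework execute_parallel around pebble's ProcessPool and drop the now-unused max_retry parameter. A self-contained sketch of the same timeout-handling pattern, with a toy worker and made-up values in place of the crawler's update_extension:

from concurrent.futures import TimeoutError
from pebble import ProcessPool, ProcessExpired

def work(item):
    return item * 2  # stand-in for update_extension(archivedir, forums, ext_id)

if __name__ == "__main__":
    items = [1, 2, 3]
    results = []
    with ProcessPool(max_workers=2, max_tasks=100) as pool:
        future = pool.map(work, items, chunksize=1, timeout=5)
        iterator = future.result()
        for item in items:
            try:
                results.append(next(iterator))
            except StopIteration:
                break
            except TimeoutError as error:
                # error.args[1] carries the timeout that was exceeded
                print("%s took longer than %d seconds" % (item, error.args[1]))
            except ProcessExpired as error:
                print("worker died, exit code: %d" % error.exitcode)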

View File

@ -169,10 +169,10 @@ def update_lib(force, archive, lib):
outphased = []
for lib_ver in local_lib_json['assets']:
version = lib_ver['version']
if not version in cdnjs_versions:
if version not in cdnjs_versions:
logging.warning("Found outphased versions for " + name + " " +
str(version) + " , preserving from archive.")
if not 'outphased' in lib_ver:
if 'outphased' not in lib_ver:
lib_ver[
'outphased'] = datetime.datetime.utcnow().isoformat()
outphased.append(lib_ver)
@ -260,7 +260,7 @@ def delete_orphaned(archive, local_libs, cdnjs_current_libs):
"""Delete all orphaned local libaries."""
dirname = os.path.join(archive, "filedb", "cdnjs", "lib")
for lib in local_libs:
if not lib in cdnjs_current_libs:
if lib not in cdnjs_current_libs:
os.remove(os.path.join(dirname, lib + ".json"))

View File

@ -25,8 +25,7 @@ import logging
import os
import re
import sys
from functools import partial, reduce
from multiprocessing import Pool
from functools import reduce
import dateutil.parser
import git
@ -70,8 +69,8 @@ def pull_list_changed_files(git_path):
for diff in single_fetch_info.commit.diff(
single_fetch_info.old_commit):
logging.debug("Found diff: " + str(diff))
if not diff.a_blob is None:
if not diff.a_blob.path in files:
if diff.a_blob is not None:
if diff.a_blob.path not in files:
files.append(diff.a_blob.path)
return files
@ -98,7 +97,7 @@ def hackish_pull_list_changed_files(git_path):
for line in pull_lines:
match = re.search(r'^ (.+) \| .*$', line)
if not match is None:
if match is not None:
changed_files = match.group(1).split('=>')
for changed_file in changed_files:
files.add(changed_file.strip())
@ -139,6 +138,7 @@ def get_file_libinfo(release_dic, git_path, libfile):
file_info['library'] = lib
file_info['version'] = version
file_info['add_date'] = release_dic[(lib, version)]
# TODO: why is package not used?
package = os.path.join(
reduce(os.path.join, plist[:idx + 1]), "package.json")
return file_info
@ -167,7 +167,7 @@ def get_all_lib_files(cdnjs_git_path, localpath=None):
libvers = set()
files = []
versionidx = len(path_to_list(cdnjs_git_path)) + 4
if not localpath is None:
if localpath is not None:
paths = os.path.join(cdnjs_git_path, localpath)
else:
paths = os.path.join(cdnjs_git_path, 'ajax/libs/**/*')
@ -196,7 +196,7 @@ def update_database_for_file(create_csv, release_dic, cdnjs_git_path, filename,
if os.path.isfile(filename):
logging.info("Updating database for file " + filename)
file_info = get_file_libinfo(release_dic, cdnjs_git_path, filename)
if not file_info is None:
if file_info is not None:
if create_csv:
print(file_info['path'])
print(cdnjs_git_path)
@ -268,7 +268,7 @@ def update_database_for_file_chunked(create_csv, release_dic, cdnjs_git_path,
retries = 0
success = False
max_retries = 4
while (not success and (retries < max_retries)):
while not success and (retries < max_retries):
try:
update_database_for_file_chunked_timeout(create_csv, release_dic,
cdnjs_git_path, filenames)
@ -305,7 +305,7 @@ def get_release_triple(git_path, libver):
lib = plist[-2]
date = get_add_date(git_path, libver)
logging.info("Release information:" + lib + " " + ver + ": " + str(date))
return (lib, ver, date)
return lib, ver, date
def build_release_date_dic(git_path, libvers):
@ -332,7 +332,6 @@ def pull_and_update_db(cdnjs_git_path, create_csv):
def update_db_from_listfile(cdnjs_git_path, listfile, create_csv):
"""Update database (without pull) for files in listfile)"""
paths = []
with open(listfile) as listfileobj:
paths = listfileobj.read().splitlines()
files = []

View File

@ -142,10 +142,12 @@ def const_verbose():
"""Default verbosity."""
return True
def const_use_process_pool():
"""Use ProcessPool (from module 'pebble') for concurrency."""
return False
def const_log_format():
return '%(process)6s %(asctime)s %(levelname)8s %(message)s'
@ -154,14 +156,17 @@ def const_discover():
"""Default configuration of discovery mode"""
return False
def const_download_ext_ids_with_forums():
"""Download extensions with forums (sequential mode)"""
return True
def const_download_ext_ids_without_forums():
"""Download extensions without forums (parallel mode)"""
return True
def const_ext_timeout():
"""Timeout for downloading an individual extension (2 hours)."""
return 2*60*60

View File

@ -15,13 +15,12 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from ExtensionCrawler.config import *
from ExtensionCrawler.util import *
from ExtensionCrawler.crx import *
from ExtensionCrawler.archive import *
from ExtensionCrawler.js_decomposer import decompose_js_with_connection, DetectionType, FileClassification
from ExtensionCrawler.config import const_mysql_config_file
from ExtensionCrawler.crx import read_crx
from ExtensionCrawler.js_decomposer import decompose_js_with_connection
from ExtensionCrawler.util import log_warning, log_debug, log_exception, log_info
from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend
from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend, convert_date
import re
from bs4 import BeautifulSoup
@ -63,7 +62,7 @@ def get_etag(ext_id, datepath, con):
link = f.read()
linked_date = link[3:].split("/")[0]
result = con.get_etag(ext_id, con.convert_date(linked_date))
result = con.get_etag(ext_id, convert_date(linked_date))
if result is not None:
return result
@ -166,7 +165,7 @@ def parse_and_insert_overview(ext_id, date, datepath, con):
con.insert(
"extension",
extid=ext_id,
date=con.convert_date(date),
date=convert_date(date),
name=name,
version=version,
description=description,
@ -184,12 +183,12 @@ def parse_and_insert_overview(ext_id, date, datepath, con):
con.insert(
"category",
extid=ext_id,
date=con.convert_date(date),
date=convert_date(date),
category_md5=hashlib.md5(category.encode()).digest(),
category=category)
def parse_and_insert_crx(ext_id, date, datepath, con):
def parse_and_insert_crx(ext_id, datepath, con):
crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
if not crx_path:
return
@ -314,7 +313,7 @@ def parse_and_insert_review(ext_id, date, reviewpath, con):
con.insert(
"review",
extid=ext_id,
date=con.convert_date(date),
date=convert_date(date),
commentdate=datetime.datetime.utcfromtimestamp(
get(review, "timestamp")).isoformat()
if "timestamp" in review else None,
@ -345,7 +344,7 @@ def parse_and_insert_support(ext_id, date, supportpath, con):
con.insert(
"support",
extid=ext_id,
date=con.convert_date(date),
date=convert_date(date),
commentdate=datetime.datetime.utcfromtimestamp(
get(review, "timestamp")).isoformat()
if "timestamp" in review else None,
@ -365,7 +364,7 @@ def parse_and_insert_replies(ext_id, date, repliespath, con):
log_debug("- parsing reply file", 3, ext_id)
with open(repliespath) as f:
d = json.load(f)
if not "searchResults" in d:
if "searchResults" not in d:
log_warning("* WARNING: there are no search results in {}".format(
repliespath), 3, ext_id)
return
@ -379,7 +378,7 @@ def parse_and_insert_replies(ext_id, date, repliespath, con):
con.insert(
"reply",
extid=ext_id,
date=con.convert_date(date),
date=convert_date(date),
commentdate=datetime.datetime.utcfromtimestamp(
get(annotation, "timestamp")).isoformat()
if "timestamp" in annotation else None,
@ -413,7 +412,7 @@ def parse_and_insert_status(ext_id, date, datepath, con):
con.insert(
"status",
extid=ext_id,
date=con.convert_date(date),
date=convert_date(date),
crx_status=crx_status,
overview_status=overview_status,
overview_exception=overview_exception)
@ -439,8 +438,8 @@ def update_db_incremental_with_connection(tmptardir, ext_id, date, con):
if etag:
try:
parse_and_insert_crx(ext_id, date, datepath, con)
except Exception as e:
parse_and_insert_crx(ext_id, datepath, con)
except Exception:
log_exception("Exception when parsing crx", 3, ext_id)
else:
crx_status = get_crx_status(datepath)
@ -449,40 +448,40 @@ def update_db_incremental_with_connection(tmptardir, ext_id, date, con):
try:
parse_and_insert_overview(ext_id, date, datepath, con)
except Exception as e:
except Exception:
log_exception("Exception when parsing overview", 3, ext_id)
try:
parse_and_insert_status(ext_id, date, datepath, con)
except Exception as e:
except Exception:
log_exception("Exception when parsing status", 3, ext_id)
reviewpaths = glob.glob(os.path.join(datepath, "reviews*-*.text"))
for reviewpath in reviewpaths:
try:
parse_and_insert_review(ext_id, date, reviewpath, con)
except json.decoder.JSONDecodeError as e:
except json.decoder.JSONDecodeError:
log_warning("- WARNING: Review is not a proper json file!", 3,
ext_id)
except Exception as e:
except Exception:
log_exception("Exception when parsing review", 3, ext_id)
supportpaths = glob.glob(os.path.join(datepath, "support*-*.text"))
for supportpath in supportpaths:
try:
parse_and_insert_support(ext_id, date, supportpath, con)
except json.decoder.JSONDecodeError as e:
except json.decoder.JSONDecodeError:
log_warning("- WARNING: Support is not a proper json file!", 3,
ext_id)
except Exception as e:
except Exception:
log_exception("Exception when parsing support", 3, ext_id)
repliespaths = glob.glob(os.path.join(datepath, "*replies.text"))
for repliespath in repliespaths:
try:
parse_and_insert_replies(ext_id, date, repliespath, con)
except json.decoder.JSONDecodeError as e:
except json.decoder.JSONDecodeError:
log_warning("- WARNING: Reply is not a proper json file!", 3,
ext_id)
except Exception as e:
except Exception:
log_exception("Exception when parsing reply", 3, ext_id)

View File

@ -18,18 +18,17 @@
import time
import datetime
from random import uniform
from itertools import starmap
import logging
import MySQLdb
import _mysql_exceptions
import ExtensionCrawler.config as config
from ExtensionCrawler.util import log_info, log_error, log_exception, log_warning
from ExtensionCrawler.util import log_info, log_error, log_warning
class MysqlBackend:
def __init__(self, ext_id, try_wait=config.const_mysql_try_wait(), maxtries=config.const_mysql_maxtries(), **kwargs):
def __init__(self, ext_id, try_wait=config.const_mysql_try_wait(), maxtries=config.const_mysql_maxtries(),
**kwargs):
self.ext_id = ext_id
self.dbargs = kwargs
self.try_wait = try_wait
@ -147,5 +146,6 @@ class MysqlBackend:
result = self.retry(lambda: self.cursor.fetchone())
return result
def convert_date(self, date):
return date[:-6]
def convert_date(date):
return date[:-6]
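convert_date is now a plain module-level function (matching the updated import and call sites in db.py above); it simply slices off the trailing UTC offset so the timestamp fits MySQL's DATETIME format. For example, assuming the crawler's usual datetime.now(timezone.utc).isoformat() timestamps, which end in "+00:00":

convert_date("2018-04-21T19:00:07.123456+00:00")
# -> '2018-04-21T19:00:07.123456'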

View File

@ -17,12 +17,10 @@
"""Python mnodule providing methods for discovering extensions in the
Chrome extension store."""
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import fromstring
import re
from functools import reduce
import requests
from ExtensionCrawler import config
from ExtensionCrawler.util import log_info, log_exception
def crawl_nearly_all_of_ext_ids():
@ -30,7 +28,7 @@ def crawl_nearly_all_of_ext_ids():
def get_inner_elems(doc):
"""Get inner element."""
return ET.fromstring(doc).iterfind(r".//{{{}}}loc".format(
return fromstring(doc).iterfind(r".//{{{}}}loc".format(
config.const_sitemap_scheme()))
def is_generic_url(url):

View File

@ -30,12 +30,14 @@ import magic
from ExtensionCrawler.js_mincer import mince_js
def is_binary_resource(mimetype_magic):
return (mimetype_magic.startswith("image/") or
mimetype_magic.startswith("video/") or
mimetype_magic.startswith("audio/") or
mimetype_magic == "application/pdf")
def normalize_jsdata(str_data):
"""Compute normalized code blocks of a JavaScript file"""
txt = ""
@ -59,9 +61,8 @@ def get_features(s):
def get_simhash(encoding, data):
"""Compute simhash of text."""
str_data = ""
if not encoding is None:
str_data = data.decode(encoding=encoding,errors="replace")
if encoding is not None:
str_data = data.decode(encoding=encoding, errors="replace")
else:
str_data = str(data)
simhash = Simhash(get_features(str_data)).value
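get_simhash above reduces a file's text to a 64-bit fingerprint with the simhash package, so near-duplicate files differ in only a few bits. A small illustration with made-up inputs (get_features is the helper defined just above in this file):

from simhash import Simhash

a = Simhash(get_features("var x = 1; function f() { return x; }")).value
b = Simhash(get_features("var x = 2; function f() { return x; }")).value
hamming = bin(a ^ b).count("1")  # small for near-identical inputs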
@ -82,31 +83,30 @@ def compute_difference(hx, hy):
def get_data_identifiers(data):
"""Get basic data identifiers (size, hashes, normalized hashes, etc.)."""
data_identifier = {}
data_identifier['encoding'] = None
data_identifier['description'] = None
data_identifier['size'] = None
data_identifier['loc'] = None
data_identifier['mimetype_magic'] = None
data_identifier['md5'] = None
data_identifier['sha1'] = None
data_identifier['sha256'] = None
data_identifier['simhash'] = None
data_identifier['size_stripped'] = None
data_identifier['normalized_encoding'] = None
data_identifier['normalized_description'] = None
data_identifier['normalized_size'] = None
data_identifier['normalized_loc'] = None
data_identifier['normalized_mimetype_magic'] = None
data_identifier['normalized_md5'] = None
data_identifier['normalized_sha1'] = None
data_identifier['normalized_sha256'] = None
data_identifier['normalized_simhash'] = None
data_identifier = {
'encoding': None,
'description': None,
'size': None,
'loc': None,
'mimetype_magic': None,
'md5': None,
'sha1': None,
'sha256': None,
'simhash': None,
'size_stripped': None,
'normalized_encoding': None,
'normalized_description': None,
'normalized_size': None,
'normalized_loc': None,
'normalized_mimetype_magic': None,
'normalized_md5': None,
'normalized_sha1': None,
'normalized_sha256': None,
'normalized_simhash': None
}
mimetype_magic = magic.from_buffer(data, mime=True)
magic_desc = ""
try:
magic_desc = magic.from_buffer(data)
except magic.MagicException as exp:
@ -137,9 +137,10 @@ def get_data_identifiers(data):
data_identifier['encoding'] = encoding
try:
normalized_data, normalized_loc = normalize_jsdata(
data.decode(encoding=data_identifier['encoding'],errors="replace"))
data.decode(encoding=data_identifier['encoding'], errors="replace"))
except Exception:
normalized_data = None
normalized_loc = 0
if normalized_data is not None:
normalized_magic_desc = ""
@ -149,7 +150,7 @@ def get_data_identifiers(data):
rgx = re.compile(r' name use count.*$')
msg = str(exp.message)
if re.search(rgx, msg):
magic_desc = re.sub(rgx, '', msg)
normalized_magic_desc = re.sub(rgx, '', msg)
else:
raise exp
normalized_encoding = chardet.detect(normalized_data)['encoding']

View File

@ -18,12 +18,10 @@
general and Chrome extensions in particular."""
import os
import io
from io import StringIO
import re
import json
import zlib
import logging
from enum import Enum
from ExtensionCrawler.js_mincer import mince_js
from ExtensionCrawler.file_identifiers import get_file_identifiers, is_binary_resource
@ -107,15 +105,15 @@ def unknown_lib_identifiers():
re.compile(
r'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\sv?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
re.IGNORECASE
), #MatchType: name version, e.g. mylib v1.2.9b or mylib.anything 1.2.8
), # MatchType: name version, e.g. mylib v1.2.9b or mylib.anything 1.2.8
re.compile(
r'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\s(?: version)\:?\s?v?([0-9][0-9.a-z_\\\\-]+)',
re.IGNORECASE
), #MatchType: name version: ver, e.g. mylib version: v1.2.9, or mylib.js version 1.2.8
), # MatchType: name version: ver, e.g. mylib version: v1.2.9, or mylib.js version 1.2.8
re.compile(
r'\@*(version)\s?[\:|-]?\s?v?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
re.IGNORECASE
), #MatchType: version x.x.x, e.g. @version: 1.2.5 or version - 1.2.5 etc.
), # MatchType: version x.x.x, e.g. @version: 1.2.5 or version - 1.2.5 etc.
re.compile(
r'(version)[\:|\=]\s?.?([0-9]{1,2}[\.|\-|\_][0-9.a-z_\\\\-]+).?',
re.IGNORECASE),
@ -188,13 +186,9 @@ def check_md5_decompressed(con, file_info):
"""Check for known md5 hash (decompressed file content)."""
if con is None:
return file_info
if file_info['dec_md5'] is None:
return file_info
else:
if file_info['dec_md5'] is not None:
libver = con.get_cdnjs_info(file_info['dec_md5'])
if libver is None:
return file_info
else:
if libver is not None:
file_info['lib'] = libver[0]
file_info['version'] = libver[1]
file_info['lib_filename'] = libver[2]
@ -203,7 +197,6 @@ def check_md5_decompressed(con, file_info):
else:
file_info['type'] = FileClassification.LIBRARY
file_info['detectionMethod'] = DetectionType.MD5_DECOMPRESSED
return file_info
return file_info
@ -361,7 +354,7 @@ def analyse_comment_known_libs(zipfile, js_file, js_info, comment):
else:
filename = js_file
for lib, regex in load_lib_identifiers().items():
if ('filecontent' in regex):
if 'filecontent' in regex:
for unkregex in regex['filecontent']:
unkown_lib_matched = unkregex.finditer(comment.content)
for match in unkown_lib_matched:
@ -481,13 +474,14 @@ def decompose_js_with_connection(path_or_zipfileobj, con):
try:
str_data = data.decode(file_info['encoding'])
except Exception:
log_info("Exception during data decoding for entry " +
file_info['filename'], 3)
log_info("Exception during data decoding for entry " + file_info['filename'], 3)
str_data = ''
else:
str_data = ''
info_data_blocks = check_data_blocks(file_info, str_data)
else:
info_data_blocks = None
if info_data_blocks:
inventory = inventory + merge_filename_and_data_info(

View File

@ -198,8 +198,8 @@ def mince_js_fileobj(fileobj):
except StopIteration:
pass
if ((is_comment(state) and is_code_or_string_literal(suc_state)) or
(is_code_or_string_literal(state) and is_comment(suc_state))):
if ((is_comment(state) and is_code_or_string_literal(suc_state)) or (
is_code_or_string_literal(state) and is_comment(suc_state))):
if content.strip():
yield (JsBlock(state, (block_start_line, block_start_cpos),
(line, cpos), content, string_literals))

View File

@ -107,7 +107,7 @@ def main(argv):
logging.info("Starting update of new db libs")
pull_and_update_db(cdnjs_git_path, csv)
logging.info("Finished update of new db libs")
if not listfile is None:
if listfile is not None:
logging.info("Starting update from list file")
update_db_from_listfile(cdnjs_git_path, listfile, csv)
logging.info("Finished update from list file")

14
crawler
View File

@ -19,7 +19,6 @@
A crawler for extensions from the Chrome Web Store.
"""
import os
import sys
import datetime
import time
@ -141,7 +140,7 @@ def log_summary(res, runtime=0):
log_info(" Total runtime: {}".format(
str(datetime.timedelta(seconds=int(runtime)))))
if corrupt_tar_archives != []:
if corrupt_tar_archives:
log_info("")
log_info("List of extensions with corrupted files/archives:")
list(
@ -229,7 +228,8 @@ def parse_args(argv):
max_discover = int(arg)
elif opt == '--pystuck':
start_pystuck = True
return basedir, parallel, verbose, discover, max_discover, download_ext_ids_with_forums, download_ext_ids_without_forums, ext_timeout, start_pystuck
return [basedir, parallel, verbose, discover, max_discover, download_ext_ids_with_forums,
download_ext_ids_without_forums, ext_timeout, start_pystuck]
def main(argv):
@ -242,8 +242,8 @@ def main(argv):
multiprocessing.set_start_method("forkserver")
today = datetime.datetime.now(datetime.timezone.utc).isoformat()
basedir, parallel, verbose, discover, max_discover, download_ext_ids_with_forums, download_ext_ids_without_forums, ext_timeout, start_pystuck = parse_args(
argv)
[basedir, parallel, verbose, discover, max_discover, download_ext_ids_with_forums, download_ext_ids_without_forums,
ext_timeout, start_pystuck] = parse_args(argv)
setup_logger(verbose)
@ -304,7 +304,7 @@ def main(argv):
# We re-try (once) the extensions with unknown exceptions, as
# they are often temporary
has_exception = list(filter(lambda x: x.has_exception(), res))
if has_exception != []:
if has_exception:
log_info(
" {} extensions with unknown exceptions, start another try ...".
format(str(len(has_exception))))
@ -318,7 +318,7 @@ def main(argv):
res = list(set(res) - set(has_exception)) + res_update
end_time = time.time()
log_summary(res, end_time - start_time)
log_summary(res, int(end_time - start_time))
log_failures_to_file(log_dir, today, res)

View File

@ -17,7 +17,6 @@
#
import getopt
import os
import sys
import tarfile
import time
@ -30,12 +29,12 @@ import datetime
from ExtensionCrawler.archive import update_db_incremental
from ExtensionCrawler.config import *
from ExtensionCrawler.util import log_info, log_warning, log_error, log_exception
from ExtensionCrawler.util import log_info, log_exception
from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend
def help():
def print_help():
print("""create-db [OPTION]""")
print(""" -h print this help text""")
print(""" -a <DIR> archive directory""")
@ -122,11 +121,11 @@ def parse_args(argv):
"maxtaskid=", "from-date=", "until-date=", "help"
])
except getopt.GetoptError:
help()
print_help()
sys.exit(2)
for opt, arg in opts:
if opt in ("-h", "--help"):
help()
print_help()
sys.exit()
elif opt in ("-a", "--archive"):
archive = arg
@ -140,12 +139,12 @@ def parse_args(argv):
taskid = int(arg)
elif opt in ("-N", "--maxtaskid"):
maxtaskid = int(arg)
elif opt in ("--from-date"):
elif opt == "--from-date":
from_date = arg
elif opt in ("--until-date"):
elif opt == "--until-date":
until_date = arg
if paths == []:
if not paths:
paths = list(find(archive, "*"))
chunksize = int(len(paths) / maxtaskid)

View File

@ -58,7 +58,6 @@ def main(argv):
basedir = const_basedir()
verbose = True
date = None
extid = ""
useetag = False
output = ""
winfs = False

View File

@ -44,7 +44,6 @@ def main(argv):
"""Main function of the extension crawler."""
verbose = False
silent = False
filename = None
csvfile = None
database = True
try:

View File

@ -39,7 +39,7 @@ import jsbeautifier
from ExtensionCrawler.config import (const_log_format, const_basedir)
from ExtensionCrawler.archive import last_crx, first_crx, all_crx
from ExtensionCrawler.config import (archive_file, get_local_archive_dir)
from ExtensionCrawler.config import get_local_archive_dir
from ExtensionCrawler.js_decomposer import init_file_info
from ExtensionCrawler.js_mincer import mince_js
@ -54,7 +54,7 @@ def is_file_with_c_style_comments(filename):
def jsstrings_data(conf, path, data):
"""Analyze data in memory."""
if not conf.file_pattern is None:
if conf.file_pattern is not None:
if path is None:
return False
elif not fnmatch.fnmatch(path, conf.file_pattern):
@ -98,7 +98,7 @@ def jsstrings_data(conf, path, data):
if analyze_block(conf, path, block, first):
match = True
first = False
if match and conf.output_decoration > 0 and conf.output_decoration < 2:
if match and 0 < conf.output_decoration < 2:
print(path)
return match
@ -112,6 +112,7 @@ def print_block(conf,
if conf.output_decoration > 1:
line_no = block.start[0]
prefix = " " * (block.start[1] - 1)
# TODO: use classifier
classifier = "X"
sep = "=" * (len(path) + 17)
if not first:
@ -129,10 +130,10 @@ def print_block(conf,
path, loc, block.string_literals[0].rstrip())
print(line)
else:
for (pos, str) in block.string_literals:
for (pos, string) in block.string_literals:
loc = '({0[0]:d}/{0[1]:d})'.format(pos)
loc = (' ' * (11 - len(loc))) + loc
line = '{0} {1} [L]: {2}'.format(path, loc, str.rstrip())
line = '{0} {1} [L]: {2}'.format(path, loc, string.rstrip())
print(line)
if code_match:
print("-" * (len(path) + 17))
@ -151,7 +152,7 @@ def analyze_block(conf, path, block, first=False):
"""Print code/comment blocks."""
match = False
regexps = []
if not conf.reg_exp is None:
if conf.reg_exp is not None:
for regexp in conf.reg_exp:
if conf.case_insensitive:
regexps.append(re.compile(r'(' + regexp + ')', re.IGNORECASE))
@ -159,7 +160,7 @@ def analyze_block(conf, path, block, first=False):
regexps.append(re.compile(r'(' + regexp + ')'))
if block.is_comment():
content = block.content
if not conf.reg_exp_comments is None:
if conf.reg_exp_comments is not None:
for regexp in conf.reg_exp_comments:
if conf.case_insensitive:
regexps.append(
@ -179,14 +180,14 @@ def analyze_block(conf, path, block, first=False):
content = block.content
regexps_string = regexps.copy()
regexps_code = regexps.copy()
if not conf.reg_exp_string_literals is None:
if conf.reg_exp_string_literals is not None:
for regexp in conf.reg_exp_string_literals:
if conf.case_insensitive:
regexps.append(
re.compile(r'(' + regexp + ')', re.IGNORECASE))
else:
regexps.append(re.compile(r'(' + regexp + ')'))
if not conf.reg_exp_source is None:
if conf.reg_exp_source is not None:
for regexp in conf.reg_exp_source:
if conf.case_insensitive:
regexps.append(
@ -222,9 +223,9 @@ def analyze_block(conf, path, block, first=False):
match_idxs.add(idx)
string_match = True
block.string_literals = []
for idx, str in enumerate(string_literals):
for idx, string in enumerate(string_literals):
if idx in match_idxs:
block.string_literals.append(str)
block.string_literals.append(string)
code_match = False
for regexp in regexps_code:
@ -259,9 +260,6 @@ def analyze_crx(conf, crx, path=""):
def analyze_tar(conf, tarfilename):
last_crx_file = ''
# from_date
# latest_date
match = False
extid = os.path.splitext(os.path.basename(tarfilename))[0]
from_dateobj = None
latest_dateobj = None
@ -303,7 +301,7 @@ def analyze_tar(conf, tarfilename):
# both dates are given
all_crx_files = all_crx(
os.path.join(conf.archive_dir, "data"), extid)
if all_crx_files == []:
if not all_crx_files:
logging.warning("No crx in " + extid)
else:
with tarfile.open(tarfilename, 'r') as archive:
@ -354,14 +352,13 @@ def analyze_task(conf, task):
"""Analyze one file/tar/crx/extid."""
logging.debug("Analyzing " + task)
extid_re = re.compile('^[a-p]+$')
retval = False
if task.endswith('.crx'):
retval = analyze_crx(conf, task)
elif task.endswith('.tar'):
retval = analyze_tar(conf, task)
elif extid_re.match(task):
tarfile = "data/" + get_local_archive_dir(task) + "/" + task + '.tar'
retval = analyze_tar(conf, conf.archive_dir + "/" + tarfile)
tarfilename = "data/" + get_local_archive_dir(task) + "/" + task + '.tar'
retval = analyze_tar(conf, conf.archive_dir + "/" + tarfilename)
else:
retval = analyze_file(conf, task)
return retval

15
extfind
View File

@ -21,10 +21,12 @@ import glob
import os
import sys
import logging
import re
from ExtensionCrawler import config
def help():
def print_help():
print("""extfind [OPTION]""")
print(""" -h print this help text""")
print(""" -a <DIR> archive directory""")
@ -51,7 +53,7 @@ def iter_extension_paths_from_file(archive, n, N, extidlistfile):
if re.fullmatch("[a-p]{32}", line) and os.path.exists(path):
paths += [path]
else:
logging.warn("WARNING: {} is not a valid extension path!".format(path))
logging.warning("WARNING: {} is not a valid extension path!".format(path))
return split(paths, n, N)
@ -67,19 +69,17 @@ def main(argv):
taskid = 1
maxtaskid = 1
paths = []
try:
opts, args = getopt.getopt(argv, "ha:g:e:n:N:", [
"archive=", "glob=", "extidlistfile=", "taskid=",
"maxtaskid=", "help"
])
except getopt.GetoptError:
help()
print_help()
sys.exit(2)
for opt, arg in opts:
if opt in ("-h", "--help"):
help()
print_help()
sys.exit()
elif opt in ("-a", "--archive"):
archive = arg
@ -99,11 +99,12 @@ def main(argv):
elif extidglob is not None and extidlistfile is None:
paths = iter_extension_paths(archive, taskid, maxtaskid, extidglob)
else:
help()
print_help()
sys.exit(2)
for path in paths:
print(path)
if __name__ == "__main__":
main(sys.argv[1:])

View File

@ -1,8 +1,10 @@
colorama==0.3.9
pystuck==0.8.5
simhash==1.8.0
tabulate==0.7.7
setuptools==36.2.7
cchardet==2.1.1
mysqlclient==1.3.10
mysqlclient==1.3.12
requests==2.18.1
pycrypto==2.6.1
beautifulsoup4==4.6.0

View File

@ -1,9 +1,12 @@
from setuptools import setup
with open('requirements.txt') as f:
requirements = f.read().splitlines()
setup(
name='Extension Crawler',
description='A collection of utilities for downloading and analyzing browser extensions from the Chrome Web Store.',
author='Achim D. Brucker, Michael Herzberg',
license='GPL 3.0',
install_requires=['GitPython', 'pebble', 'simhash', 'colorama', 'python_magic', 'tabulate', 'requests', 'pycrypto', 'beautifulsoup4', 'python_dateutil', 'mysqlclient', 'cchardet', 'jsbeautifier', 'pystuck']
install_requires=requirements
)