Merge branch 'master' of logicalhacking.com:BrowserSecurity/ExtensionCrawler

This commit is contained in:
Michael Herzberg 2017-10-06 20:13:16 +01:00
commit 2abc386f48
3 changed files with 367 additions and 197 deletions

View File

@ -202,6 +202,39 @@ def last_crx(archivedir, extid, date=None):
return last_crx return last_crx
def first_crx(archivedir, extid, date=None):
    """Return the tar-member name of the oldest crx of an extension.

    Counterpart of last_crx: scans <archivedir>/<local dir>/<extid>.tar
    for non-empty ``*.crx`` members, optionally restricted to releases
    on or after ``date``, and returns the earliest one.

    Args:
        archivedir: root directory of the extension archive.
        extid: extension id (also the base name of the tar file).
        date: optional tz-aware lower bound; only crx members whose
            containing directory name parses to a date >= ``date`` are
            considered.

    Returns:
        Name of the first matching tar member, or "" when the tar does
        not exist or holds no matching crx.
    """
    first_crx = ""
    tar = os.path.join(archivedir, get_local_archive_dir(extid),
                       extid + ".tar")
    if os.path.exists(tar):
        # 'with' guarantees the tar handle is closed even if date
        # parsing raises (the original open/close pair could leak it).
        with tarfile.open(tar, 'r') as t:
            old_crxs = sorted(
                x.name for x in t.getmembers()
                if x.name.endswith(".crx") and x.size > 0 and (
                    date is None or date <= dateutil.parser.parse(
                        # member paths look like .../<release date>/<file>.crx
                        os.path.split(os.path.split(x.name)[0])[1])))
        if old_crxs:
            first_crx = old_crxs[0]
    return first_crx
def all_crx(archivedir, extid, date=None):
    """Return the sorted names of all non-empty crx members of an extension.

    Scans <archivedir>/<local dir>/<extid>.tar for ``*.crx`` members
    with a positive size.

    Args:
        archivedir: root directory of the extension archive.
        extid: extension id (also the base name of the tar file).
        date: accepted for signature parity with first_crx/last_crx but
            currently ignored — every crx is returned.

    Returns:
        Sorted list of matching tar-member names; [] when the tar does
        not exist or contains no crx.
    """
    tar = os.path.join(archivedir, get_local_archive_dir(extid),
                       extid + ".tar")
    all_crxs = []
    if os.path.exists(tar):
        # 'with' guarantees the tar handle is closed on any exit path
        # (the original open/close pair could leak it on an exception).
        with tarfile.open(tar, 'r') as t:
            all_crxs = sorted(x.name for x in t.getmembers()
                              if x.name.endswith(".crx") and x.size > 0)
    return all_crxs
def last_etag(archivedir, extid, crxfile): def last_etag(archivedir, extid, crxfile):
etag = "" etag = ""

View File

@ -17,17 +17,20 @@
# #
"""Tool for extracting crx file from a tar archive.""" """Tool for extracting crx file from a tar archive."""
import collections
import datetime import datetime
import getopt import argparse
import io import io
import fnmatch
import os import os
import logging import logging
import re import re
import sys import sys
import operator
import tarfile import tarfile
import zlib import zlib
from functools import partial from functools import partial, reduce
from colorama import init, Fore
from multiprocessing import Pool from multiprocessing import Pool
from zipfile import ZipFile from zipfile import ZipFile
@ -36,7 +39,7 @@ import dateutil.parser
import jsbeautifier import jsbeautifier
from ExtensionCrawler.config import (const_log_format, const_basedir) from ExtensionCrawler.config import (const_log_format, const_basedir)
from ExtensionCrawler.archive import get_existing_ids, last_crx from ExtensionCrawler.archive import last_crx, first_crx, all_crx
from ExtensionCrawler.config import (archive_file, get_local_archive_dir) from ExtensionCrawler.config import (archive_file, get_local_archive_dir)
from ExtensionCrawler.js_decomposer import init_file_info from ExtensionCrawler.js_decomposer import init_file_info
from ExtensionCrawler.js_mincer import mince_js from ExtensionCrawler.js_mincer import mince_js
@ -44,15 +47,27 @@ from ExtensionCrawler.js_mincer import mince_js
# Script should run with python 3.4 or 3.5 # Script should run with python 3.4 or 3.5
assert sys.version_info >= (3, 4) and sys.version_info < (3, 6) assert sys.version_info >= (3, 4) and sys.version_info < (3, 6)
JsStringsConfig = collections.namedtuple('JsStringsConfig', [
'comment', 'strings', 'group', 'program', 'beautify', 'basedir', 'regexp', def is_file_with_c_style_comments(filename):
'parallel', "verbose" """Test if filename indicates file with C-style comment."""
]) return (filename.endswith(".js") or filename.endswith(".js.gz")
or filename.endswith(".jgz") or filename.endswith(".jsg")
or filename.endswith(".css.gz") or filename.endswith(".c")
or filename.endswith(".cpp") or filename.endswith(".java"))
def jsstrings_data(path, data, config): def jsstrings_data(conf, path, data):
"""Analyze data in memory."""
if not conf.file_pattern is None:
if path is None:
return False
elif not fnmatch.fnmatch(path, conf.file_pattern):
logging.debug("Filename \'" + path + "\' does not match pattern \'"
+ conf.file_pattern + "\'")
return False
match = False match = False
print("## Analyzing " + path) logging.debug("Start analyzing " + path)
file_info = init_file_info(path, data) file_info = init_file_info(path, data)
if file_info['size'] == 0: if file_info['size'] == 0:
return match return match
@ -62,8 +77,7 @@ def jsstrings_data(path, data, config):
dec = zlib.decompressobj(zlib.MAX_WBITS | 16) dec = zlib.decompressobj(zlib.MAX_WBITS | 16)
dec_data = dec.decompress(data, 100 * file_info['size']) dec_data = dec.decompress(data, 100 * file_info['size'])
if file_info['dec_encoding'] is None: if file_info['dec_encoding'] is None:
logging.warning("Encoding is None for " + path + logging.debug("Encoding is None for " + path + " using utf-8.")
" using utf-8.")
str_data = dec_data.decode('UTF-8') str_data = dec_data.decode('UTF-8')
else: else:
str_data = dec_data.decode(file_info['dec_encoding']) str_data = dec_data.decode(file_info['dec_encoding'])
@ -77,219 +91,342 @@ def jsstrings_data(path, data, config):
else: else:
str_data = data.decode(file_info['encoding']) str_data = data.decode(file_info['encoding'])
if config.beautify: if conf.beautify:
str_data = jsbeautifier.beautify(str_data) str_data = jsbeautifier.beautify(str_data)
with io.StringIO(str_data) as str_obj: with io.StringIO(str_data) as str_obj:
for block in mince_js( for block in mince_js(
str_obj, single_line_comments_block=config.group): str_obj,
if analyze_block(True, config.comment, config.program, single_line_comments_block=conf.group_single_line_comments):
config.strings, config.regexp, block): if analyze_block(conf, block):
match = True match = True
return match return match
def helpmsg(): def print_block(conf, block, string_match=False, code_match=False):
"""Print help message.""" print(block)
print("crx-jsstrings [OPTION] [crx-file|tar-file|ext_id] [js-file]")
print(" -h print this help text")
print(" -i ignore comments")
print(" -s strings")
print(" -g group single line comments")
print(" -c program code")
print(" -b beautify JavaScript files before analyzing them")
print(" -a=<DIR> archive directory")
print(" -n <TASKID> process chunk n where n in [1,N]")
print(" -N <MAXTASKID> ")
print(
" -r regexp select only comments/code/strings where regexp matches")
print(
" -d date use latest extension that was released not later than date (only for tar archives)"
)
def analyze_block(verbose, comment, program, strings, regexp, block): def analyze_block(conf, block):
"""Print code/comment blocks.""" """Print code/comment blocks."""
match = False match = False
rgx = None regexps = []
if regexp is not None: if not conf.reg_exp is None:
rgx = re.compile(regexp) for regexp in conf.reg_exp:
if comment and block.is_comment(): regexps.append(re.compile('('+regexp+')'))
if regexp is None or rgx.match(block.content): if block.is_comment():
if verbose: content = block.content
print(block) if not conf.reg_exp_comments is None:
match = True for regexp in conf.reg_exp_comments:
elif block.is_code(): regexps.append(re.compile('('+regexp+')'))
if program: for regexp in regexps:
if regexp is None or rgx.match(block.content): if regexp.search(block.content):
if verbose: if conf.colorize:
print(block) content = regexp.sub(Fore.RED + r'\1' + Fore.RESET, content)
match = True match = True
if strings:
for string in block.string_literals:
if regexp is None or rgx.match(string):
if verbose:
print(string)
match = True
return match
def analyze_crx(config, crx, path):
match = False
if path is None:
with ZipFile(crx) as crxobj:
js_files = list(
filter(
lambda x: x.filename.endswith(".js")
or x.filename.endswith(".js.gz")
or x.filename.endswith(".jgz")
or x.filename.endswith(".jsg")
or x.filename.endswith(".css.gz"),
crxobj.infolist()))
for jsfile in js_files:
with crxobj.open(jsfile) as js_file_obj:
data = js_file_obj.read()
path = js_file_obj.name
if jsstrings_data(path, data, config):
match = True
else:
with ZipFile(crx) as crxobj:
with crxobj.open(path) as js_file:
data = js_file.read()
match = jsstrings_data(path, data, config)
return match
def analyze_tar(config, date, path, filename):
last_crx_file = ''
match = False
extid = os.path.splitext(os.path.basename(filename))[0]
if date is not None:
dateobj = dateutil.parser.parse(date)
if dateobj.tzinfo is None or dateobj.tzinfo.utcoffset(dateobj) is None:
dateobj = dateobj.replace(tzinfo=datetime.timezone.utc)
last_crx_file = last_crx(
os.path.join(config.basedir, "data"), extid, dateobj)
else:
last_crx_file = last_crx(os.path.join(config.basedir, "data"), extid)
if last_crx_file == "" or last_crx_file is None:
print("No crx in " + extid)
else:
print("# Start analyzing " + extid)
with tarfile.open(filename, 'r') as archive:
with archive.extractfile(last_crx_file) as crx:
match = analyze_crx(config, crx, path)
if match: if match:
print("RegExp found in " + extid) block.content = content
print_block(conf, block)
elif block.is_code():
content = block.content
regexps_string = regexps.copy()
regexps_code = regexps.copy()
if not conf.reg_exp_string_literals is None:
for regexp in conf.reg_exp_string_literals:
regexps_string.append(re.compile('('+regexp+')'))
if not conf.reg_exp_source is None:
for regexp in conf.reg_exp_source:
regexps_code.append(re.compile('('+regexp+')'))
string_match = False
for regexp in regexps_string:
string_literals = block.string_literals.copy()
for idx,string in enumerate(block.string_literals):
if regexp.search(string):
if conf.colorize:
string_literals[idx] = regexp.sub(Fore.BLUE + r'\1' + Fore.RESET, string_literals[idx])
string_match = True
code_match = False
for regexp in regexps_code:
if regexp.search(block.content):
if conf.colorize:
content = regexp.sub(Fore.CYAN + r'\1' + Fore.RESET, content)
code_match = True
match = string_match or code_match
block.content = content
if match:
print_block(conf, block, string_match, code_match)
return match
def analyze_crx(conf, crx):
    """Analyze every C-style-commented file inside a crx (zip) archive.

    Args:
        conf: parsed command-line configuration (argparse namespace).
        crx: path of a crx file, or an open binary file object (e.g.
            the result of ``archive.extractfile`` in analyze_tar).

    Returns:
        True if jsstrings_data reported a match for any contained file,
        False otherwise.
    """
    match = False
    # Bug fix: 'crx' may be a file object (analyze_tar passes the result
    # of archive.extractfile), so the original 'crx + "/" + path' raised
    # TypeError there. Derive a printable string label instead.
    crx_label = crx if isinstance(crx, str) else getattr(crx, 'name', '<crx>')
    with ZipFile(crx) as crxobj:
        js_files = [info for info in crxobj.infolist()
                    if is_file_with_c_style_comments(info.filename)]
        for jsfile in js_files:
            with crxobj.open(jsfile) as js_file_obj:
                data = js_file_obj.read()
                path = js_file_obj.name
                if jsstrings_data(conf, crx_label + "/" + path, data):
                    match = True
    return match
def analyze_tar(conf, tarfilename):
last_crx_file = ''
# from_date
# latest_date
match = False
extid = os.path.splitext(os.path.basename(tarfilename))[0]
from_dateobj = None
latest_dateobj = None
if conf.from_date is not None:
from_dateobj = dateutil.parser.parse(conf.from_date)
if from_dateobj.tzinfo is None or from_dateobj.tzinfo.utcoffset(
from_dateobj) is None:
from_dateobj = from_dateobj.replace(tzinfo=datetime.timezone.utc)
if conf.latest_date is not None:
latest_dateobj = dateutil.parser.parse(conf.latest_date)
if latest_dateobj.tzinfo is None or latest_dateobj.tzinfo.utcoffset(
latest_dateobj) is None:
latest_dateobj = latest_dateobj.replace(
tzinfo=datetime.timezone.utc)
match = False
if from_dateobj is None:
last_crx_file = last_crx(
os.path.join(conf.archive_dir, "data"), extid, latest_dateobj)
if last_crx_file == "" or last_crx_file is None:
logging.warning("No crx in " + extid)
else: else:
print("RegExp not found in " + extid) with tarfile.open(tarfilename, 'r') as archive:
with archive.extractfile(last_crx_file) as crx:
match = analyze_crx(conf, crx)
def process_group(config, taskid, maxtaskid, date, path):
archive_dir = os.path.join(config.basedir, "data")
ext_ids = get_existing_ids(archive_dir)
chunksize = int(len(ext_ids) / maxtaskid)
if taskid == maxtaskid:
ext_ids = ext_ids[(taskid - 1) * chunksize:]
else: else:
ext_ids = ext_ids[(taskid - 1) * chunksize:taskid * chunksize] if latest_dateobj is None:
# only from date is given
ext_ids = list(map(partial(archive_file, archive_dir), ext_ids)) first_crx_file = first_crx(
os.path.join(conf.archive_dir, "data"), extid, from_dateobj)
with Pool(config.parallel) as p: if first_crx_file == "" or first_crx_file is None:
p.map(partial(analyze_tar, config, date, path), ext_ids) logging.warning("No crx in " + extid)
else:
with tarfile.open(tarfilename, 'r') as archive:
with archive.extractfile(first_crx_file) as crx:
match = analyze_crx(conf, crx)
else:
# both dates are given
all_crx_files = all_crx(
os.path.join(conf.archive_dir, "data"), extid)
if all_crx_files == []:
logging.warning("No crx in " + extid)
else:
with tarfile.open(tarfilename, 'r') as archive:
for crx_file in all_crx_files:
with archive.extractfile(crx_file) as crx:
match = analyze_crx(conf, crx) or match
def main(argv): def analyze_file(conf, filename):
"""Main function: JavaScript strings on steroids.""" with open(filename, 'rb') as fileobj:
config = JsStringsConfig( data = fileobj.read()
comment=True, return jsstrings_data(conf, filename, data)
strings=False,
group=False,
program=False,
beautify=False,
basedir=const_basedir(),
regexp=None,
parallel=1,
verbose=True)
filename = None
path = None def compute_tasks(file_or_extids, taskid=1, maxtaskid=1):
date = None """Function for computing list of tasks."""
taskid = -1
maxtaskid = -1
extid_re = re.compile('^[a-p]+$') extid_re = re.compile('^[a-p]+$')
tasks = []
for file_or_extid in file_or_extids:
if is_file_with_c_style_comments(file_or_extid):
tasks.append(file_or_extid)
elif file_or_extid.endswith('.tar'):
tasks.append(file_or_extid)
elif file_or_extid.endswith('.crx'):
tasks.append(file_or_extid)
elif extid_re.match(file_or_extid):
tasks.append(file_or_extid)
else:
# default: a file with extension ide
with open(file_or_extid) as fileobj:
for line in fileobj:
line = line.strip()
if extid_re.match(line):
tasks.append(line)
try: chunksize = int(len(tasks) / maxtaskid)
opts, args = getopt.getopt(argv, "hibcd:sn:N:a:vr:", [ if taskid == maxtaskid:
"--regexp", "--date", "--archive", "--beautify" tasks = tasks[(taskid - 1) * chunksize:]
])
except getopt.GetoptError:
helpmsg()
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
helpmsg()
sys.exit()
elif opt in ("-a", "--archive"):
config = config._replace(basedir=arg)
elif opt == '-i':
config = config._replace(comment=False)
elif opt == '-s':
config = config._replace(strings=True)
elif opt == '-g':
config = config._replace(group=True)
elif opt == '-c':
config = config._replace(program=True)
elif opt in ('-b', "--beautify"):
config = config._replace(beautify=True)
elif opt in ('-r', "--regexp"):
config = config._replace(regexp=arg)
elif opt in ('-d', "--date"):
date = arg
elif opt in ("-n", "--taskid"):
taskid = int(arg)
elif opt in ("-N", "--maxtaskid"):
maxtaskid = int(arg)
if len(args) == 1:
filename = args[0]
elif len(args) == 2:
filename = args[0]
path = args[1]
elif (not len(args) == 0) or taskid < 1 or maxtaskid < 1:
helpmsg()
sys.exit()
if config.verbose:
loglevel = logging.INFO
else: else:
loglevel = logging.WARNING tasks = tasks[(taskid - 1) * chunksize:taskid * chunksize]
return tasks
def analyze_task(conf, task):
    """Analyze a single task: a plain file, a tar, a crx, or an extid."""
    logging.debug("Analyzing " + task)
    # Dispatch on the task's shape; guard clauses replace the single
    # accumulator variable of the original implementation.
    if task.endswith('.crx'):
        return analyze_crx(conf, task)
    if task.endswith('.tar'):
        return analyze_tar(conf, task)
    if re.match('^[a-p]+$', task):
        # Bare extension ids (base-16 encoded with letters a-p) map to
        # their archived tar file.
        return analyze_tar(conf, task + '.tar')
    # Fallback: treat the task as an ordinary file on disk.
    return analyze_file(conf, task)
def main(conf):
"""Main function: JavaScript strings on steroids."""
logger = logging.getLogger() logger = logging.getLogger()
ch = logging.StreamHandler(sys.stdout) ch = logging.StreamHandler(sys.stdout)
ch.setFormatter(logging.Formatter(const_log_format())) ch.setFormatter(logging.Formatter(const_log_format()))
logger.addHandler(ch) logger.addHandler(ch)
logger.setLevel(loglevel) if conf.verbose:
logger.setLevel(logging.DEBUG)
if taskid > 0 and maxtaskid > 0:
process_group(config, taskid, maxtaskid, date, path)
else: else:
if filename.endswith('.crx'): logger.setLevel(logging.WARNING)
analyze_crx(config, filename, path)
elif filename.endswith('.tar'): if conf.colorize:
analyze_tar(config, date, path, filename) init()
elif extid_re.match(filename):
extid = filename if conf.join_string_literals:
filename = os.path.join(config.basedir, 'data', logging.warning("Joining of string literals not yet supported!")
get_local_archive_dir(extid),
extid + ".tar") tasks = compute_tasks(conf.FILE_OR_EXTID, conf.taskid, conf.max_taskid)
analyze_tar(config, date, path, filename) with Pool(conf.parallel) as p:
else: retvals = p.map(partial(analyze_task, conf), tasks)
with open(filename, 'rb') as fileobj: return reduce(operator.or_, retvals, False)
data = fileobj.read()
jsstrings_data(filename, data, config)
if __name__ == "__main__": if __name__ == "__main__":
main(sys.argv[1:]) main_parser = argparse.ArgumentParser(
description=
'A combination of strings and grep for JavaScript and CSS files.')
main_parser.add_argument(
'-r',
'--reg-exp',
metavar='REGEXP',
type=str,
nargs='+',
help='search for regular expression')
main_parser.add_argument(
'-v',
'--verbose',
action='store_true',
default=False,
help='increase verbosity')
main_parser.add_argument(
'-o',
'--output-decoration',
metavar='L',
choices=[0, 1, 2, 3],
type=int,
help='show only matching files, crx, tar')
main_parser.add_argument(
'-p',
'--parallel',
metavar='P',
type=int,
help='run P threads in parallel')
main_parser.add_argument(
'-D',
'--latest-date',
metavar='DATE',
type=str,
help=
'select latest crx from tar, released before DATE. Together with --from-date, specifies all crx released in specified date range.'
)
main_parser.add_argument(
'-d',
'--from-date',
metavar='DATE',
type=str,
help=
'select oldest crx from tar released after DATE. Together with --from-date, specifies all crx released in specified date range.'
)
main_parser.add_argument(
'-f',
'--file-pattern',
metavar='pattern',
type=str,
help='process only files matching pattern')
main_parser.add_argument(
'-a',
'--archive-dir',
metavar='archive',
type=str,
default=const_basedir(),
help='archive directory')
main_parser.add_argument(
'-C', '--colorize', action='store_true', help='use colors')
main_parser.add_argument(
'-n', '--taskid', metavar='n', type=int, default=1, help='task id')
main_parser.add_argument(
'-N',
'--max-taskid',
metavar='N',
type=int,
default=1,
help='max task id')
main_parser.add_argument(
'FILE_OR_EXTID', nargs='+', help="extid/js/css/crx/tar file")
comment_group = main_parser.add_argument_group('comment blocks')
comment_group.add_argument(
'-g',
'--group-single-line-comments',
help='Group consecutive singe-line comments into blocks')
comment_group.add_argument(
'-c',
'--reg-exp-comments',
metavar='REGEXP',
type=str,
nargs='+',
help='search comments for regular expression')
source_group = main_parser.add_argument_group('source blocks')
source_group.add_argument(
'-b',
'--beautify',
action='store_true',
default=False,
help='beautify source code')
source_group.add_argument(
'-s',
'--reg-exp-source',
metavar='REGEXP',
type=str,
nargs='+',
help='search source for regular expression')
strings_group = main_parser.add_argument_group('string literals')
strings_group.add_argument(
'-j',
'--join-string-literals',
action='store_true',
help='join string literals (heuristic)')
strings_group.add_argument(
'-l',
'--reg-exp-string-literals',
metavar='REGEXP',
type=str,
nargs='+',
help='search string literals for regular expression')
main_conf = main_parser.parse_args()
main(main_conf)

View File

@ -5,5 +5,5 @@ setup(
description='A collection of utilities for downloading and analyzing browser extension from the Chrome Web store.', description='A collection of utilities for downloading and analyzing browser extension from the Chrome Web store.',
author='Achim D. Brucker, Michael Herzberg', author='Achim D. Brucker, Michael Herzberg',
license='GPL 3.0', license='GPL 3.0',
install_requires=['GitPython', 'python_magic', 'tabulate', 'requests', 'pycrypto', 'beautifulsoup4', 'python_dateutil', 'mysqlclient', 'cchardet', 'jsbeautifier'] install_requires=['GitPython', 'colorama', 'python_magic', 'tabulate', 'requests', 'pycrypto', 'beautifulsoup4', 'python_dateutil', 'mysqlclient', 'cchardet', 'jsbeautifier']
) )