Merge branch 'master' of logicalhacking.com:BrowserSecurity/ExtensionCrawler

This commit is contained in:
Michael Herzberg 2017-10-06 20:13:16 +01:00
commit 2abc386f48
3 changed files with 367 additions and 197 deletions

View File

@@ -202,6 +202,39 @@ def last_crx(archivedir, extid, date=None):
return last_crx
def first_crx(archivedir, extid, date=None):
    """Return the archive-internal path of the oldest crx of an extension.

    Only tar members ending in ``.crx`` with non-zero size are considered;
    if `date` is given, only members whose (date-named) parent directory
    parses to a date on or after `date` qualify.  Returns "" when the tar
    does not exist or contains no matching crx.
    """
    first_crx = ""
    tar = os.path.join(archivedir, get_local_archive_dir(extid),
                       extid + ".tar")
    if os.path.exists(tar):
        # Context manager guarantees the tar is closed even if parsing
        # a member's date raises.
        with tarfile.open(tar, 'r') as t:
            old_crxs = sorted([
                x.name for x in t.getmembers()
                if x.name.endswith(".crx") and x.size > 0 and (
                    date is None or (date <= dateutil.parser.parse(
                        os.path.split(os.path.split(x.name)[0])[1])))
            ])
        if old_crxs:
            first_crx = old_crxs[0]
    return first_crx
def all_crx(archivedir, extid, date=None):
    """Return the sorted list of all non-empty crx members of an extension tar.

    Returns [] when the tar does not exist.  `date` is accepted for
    interface symmetry with first_crx/last_crx but is currently unused.
    """
    tar = os.path.join(archivedir, get_local_archive_dir(extid),
                       extid + ".tar")
    all_crxs = []
    if os.path.exists(tar):
        # Context manager guarantees the tar is closed even on errors.
        with tarfile.open(tar, 'r') as t:
            all_crxs = sorted([
                x.name for x in t.getmembers()
                if x.name.endswith(".crx") and x.size > 0
            ])
    return all_crxs
def last_etag(archivedir, extid, crxfile):
etag = ""

View File

@@ -17,17 +17,20 @@
#
"""Tool for extracting crx file from a tar archive."""
import collections
import datetime
import getopt
import argparse
import io
import fnmatch
import os
import logging
import re
import sys
import operator
import tarfile
import zlib
from functools import partial
from functools import partial, reduce
from colorama import init, Fore
from multiprocessing import Pool
from zipfile import ZipFile
@@ -36,7 +39,7 @@ import dateutil.parser
import jsbeautifier
from ExtensionCrawler.config import (const_log_format, const_basedir)
from ExtensionCrawler.archive import get_existing_ids, last_crx
from ExtensionCrawler.archive import last_crx, first_crx, all_crx
from ExtensionCrawler.config import (archive_file, get_local_archive_dir)
from ExtensionCrawler.js_decomposer import init_file_info
from ExtensionCrawler.js_mincer import mince_js
@@ -44,15 +47,27 @@ from ExtensionCrawler.js_mincer import mince_js
# Script should run with python 3.4 or 3.5
assert sys.version_info >= (3, 4) and sys.version_info < (3, 6)
JsStringsConfig = collections.namedtuple('JsStringsConfig', [
'comment', 'strings', 'group', 'program', 'beautify', 'basedir', 'regexp',
'parallel', "verbose"
])
def is_file_with_c_style_comments(filename):
    """Test if filename indicates a file with C-style comments."""
    # str.endswith accepts a tuple of suffixes: one C-level call instead
    # of a chain of `or`ed calls.
    return filename.endswith((".js", ".js.gz", ".jgz", ".jsg",
                              ".css.gz", ".c", ".cpp", ".java"))
def jsstrings_data(path, data, config):
def jsstrings_data(conf, path, data):
"""Analyze data in memory."""
if not conf.file_pattern is None:
if path is None:
return False
elif not fnmatch.fnmatch(path, conf.file_pattern):
logging.debug("Filename \'" + path + "\' does not match pattern \'"
+ conf.file_pattern + "\'")
return False
match = False
print("## Analyzing " + path)
logging.debug("Start analyzing " + path)
file_info = init_file_info(path, data)
if file_info['size'] == 0:
return match
@@ -62,8 +77,7 @@ def jsstrings_data(path, data, config):
dec = zlib.decompressobj(zlib.MAX_WBITS | 16)
dec_data = dec.decompress(data, 100 * file_info['size'])
if file_info['dec_encoding'] is None:
logging.warning("Encoding is None for " + path +
" using utf-8.")
logging.debug("Encoding is None for " + path + " using utf-8.")
str_data = dec_data.decode('UTF-8')
else:
str_data = dec_data.decode(file_info['dec_encoding'])
@@ -77,219 +91,342 @@ def jsstrings_data(path, data, config):
else:
str_data = data.decode(file_info['encoding'])
if config.beautify:
if conf.beautify:
str_data = jsbeautifier.beautify(str_data)
with io.StringIO(str_data) as str_obj:
for block in mince_js(
str_obj, single_line_comments_block=config.group):
if analyze_block(True, config.comment, config.program,
config.strings, config.regexp, block):
str_obj,
single_line_comments_block=conf.group_single_line_comments):
if analyze_block(conf, block):
match = True
return match
def helpmsg():
"""Print help message."""
print("crx-jsstrings [OPTION] [crx-file|tar-file|ext_id] [js-file]")
print(" -h print this help text")
print(" -i ignore comments")
print(" -s strings")
print(" -g group single line comments")
print(" -c program code")
print(" -b beautify JavaScript files before analyzing them")
print(" -a=<DIR> archive directory")
print(" -n <TASKID> process chunk n where n in [1,N]")
print(" -N <MAXTASKID> ")
print(
" -r regexp select only comments/code/strings where regexp matches")
print(
" -d date use latest extension that was released not later than date (only for tar archives)"
)
def print_block(conf, block, string_match=False, code_match=False):
    """Print a mined code/comment block to stdout.

    conf, string_match and code_match are currently unused — presumably
    reserved for output decoration/filtering (cf. the --output-decoration
    option); TODO confirm.
    """
    print(block)
def analyze_block(verbose, comment, program, strings, regexp, block):
def analyze_block(conf, block):
"""Print code/comment blocks."""
match = False
rgx = None
if regexp is not None:
rgx = re.compile(regexp)
if comment and block.is_comment():
if regexp is None or rgx.match(block.content):
if verbose:
print(block)
match = True
elif block.is_code():
if program:
if regexp is None or rgx.match(block.content):
if verbose:
print(block)
regexps = []
if not conf.reg_exp is None:
for regexp in conf.reg_exp:
regexps.append(re.compile('('+regexp+')'))
if block.is_comment():
content = block.content
if not conf.reg_exp_comments is None:
for regexp in conf.reg_exp_comments:
regexps.append(re.compile('('+regexp+')'))
for regexp in regexps:
if regexp.search(block.content):
if conf.colorize:
content = regexp.sub(Fore.RED + r'\1' + Fore.RESET, content)
match = True
if strings:
for string in block.string_literals:
if regexp is None or rgx.match(string):
if verbose:
print(string)
match = True
return match
def analyze_crx(config, crx, path):
match = False
if path is None:
with ZipFile(crx) as crxobj:
js_files = list(
filter(
lambda x: x.filename.endswith(".js")
or x.filename.endswith(".js.gz")
or x.filename.endswith(".jgz")
or x.filename.endswith(".jsg")
or x.filename.endswith(".css.gz"),
crxobj.infolist()))
for jsfile in js_files:
with crxobj.open(jsfile) as js_file_obj:
data = js_file_obj.read()
path = js_file_obj.name
if jsstrings_data(path, data, config):
match = True
else:
with ZipFile(crx) as crxobj:
with crxobj.open(path) as js_file:
data = js_file.read()
match = jsstrings_data(path, data, config)
return match
def analyze_tar(config, date, path, filename):
last_crx_file = ''
match = False
extid = os.path.splitext(os.path.basename(filename))[0]
if date is not None:
dateobj = dateutil.parser.parse(date)
if dateobj.tzinfo is None or dateobj.tzinfo.utcoffset(dateobj) is None:
dateobj = dateobj.replace(tzinfo=datetime.timezone.utc)
last_crx_file = last_crx(
os.path.join(config.basedir, "data"), extid, dateobj)
else:
last_crx_file = last_crx(os.path.join(config.basedir, "data"), extid)
if last_crx_file == "" or last_crx_file is None:
print("No crx in " + extid)
else:
print("# Start analyzing " + extid)
with tarfile.open(filename, 'r') as archive:
with archive.extractfile(last_crx_file) as crx:
match = analyze_crx(config, crx, path)
if match:
print("RegExp found in " + extid)
block.content = content
print_block(conf, block)
elif block.is_code():
content = block.content
regexps_string = regexps.copy()
regexps_code = regexps.copy()
if not conf.reg_exp_string_literals is None:
for regexp in conf.reg_exp_string_literals:
regexps_string.append(re.compile('('+regexp+')'))
if not conf.reg_exp_source is None:
for regexp in conf.reg_exp_source:
regexps_code.append(re.compile('('+regexp+')'))
string_match = False
for regexp in regexps_string:
string_literals = block.string_literals.copy()
for idx,string in enumerate(block.string_literals):
if regexp.search(string):
if conf.colorize:
string_literals[idx] = regexp.sub(Fore.BLUE + r'\1' + Fore.RESET, string_literals[idx])
string_match = True
code_match = False
for regexp in regexps_code:
if regexp.search(block.content):
if conf.colorize:
content = regexp.sub(Fore.CYAN + r'\1' + Fore.RESET, content)
code_match = True
match = string_match or code_match
block.content = content
if match:
print_block(conf, block, string_match, code_match)
return match
def analyze_crx(conf, crx):
    """Analyze every C-style-commented member of a crx (zip) archive.

    Returns True iff at least one member produced a match.
    NOTE(review): `crx` is concatenated with "/" below, so this path
    expects a filename string — confirm callers that pass file objects.
    """
    match = False
    with ZipFile(crx) as crxobj:
        candidates = [info for info in crxobj.infolist()
                      if is_file_with_c_style_comments(info.filename)]
        for info in candidates:
            with crxobj.open(info) as member:
                payload = member.read()
                member_path = member.name
                if jsstrings_data(conf, crx + "/" + member_path, payload):
                    match = True
    return match
def analyze_tar(conf, tarfilename):
last_crx_file = ''
# from_date
# latest_date
match = False
extid = os.path.splitext(os.path.basename(tarfilename))[0]
from_dateobj = None
latest_dateobj = None
if conf.from_date is not None:
from_dateobj = dateutil.parser.parse(conf.from_date)
if from_dateobj.tzinfo is None or from_dateobj.tzinfo.utcoffset(
from_dateobj) is None:
from_dateobj = from_dateobj.replace(tzinfo=datetime.timezone.utc)
if conf.latest_date is not None:
latest_dateobj = dateutil.parser.parse(conf.latest_date)
if latest_dateobj.tzinfo is None or latest_dateobj.tzinfo.utcoffset(
latest_dateobj) is None:
latest_dateobj = latest_dateobj.replace(
tzinfo=datetime.timezone.utc)
match = False
if from_dateobj is None:
last_crx_file = last_crx(
os.path.join(conf.archive_dir, "data"), extid, latest_dateobj)
if last_crx_file == "" or last_crx_file is None:
logging.warning("No crx in " + extid)
else:
print("RegExp not found in " + extid)
def process_group(config, taskid, maxtaskid, date, path):
archive_dir = os.path.join(config.basedir, "data")
ext_ids = get_existing_ids(archive_dir)
chunksize = int(len(ext_ids) / maxtaskid)
if taskid == maxtaskid:
ext_ids = ext_ids[(taskid - 1) * chunksize:]
with tarfile.open(tarfilename, 'r') as archive:
with archive.extractfile(last_crx_file) as crx:
match = analyze_crx(conf, crx)
else:
ext_ids = ext_ids[(taskid - 1) * chunksize:taskid * chunksize]
ext_ids = list(map(partial(archive_file, archive_dir), ext_ids))
with Pool(config.parallel) as p:
p.map(partial(analyze_tar, config, date, path), ext_ids)
if latest_dateobj is None:
# only from date is given
first_crx_file = first_crx(
os.path.join(conf.archive_dir, "data"), extid, from_dateobj)
if first_crx_file == "" or first_crx_file is None:
logging.warning("No crx in " + extid)
else:
with tarfile.open(tarfilename, 'r') as archive:
with archive.extractfile(first_crx_file) as crx:
match = analyze_crx(conf, crx)
else:
# both dates are given
all_crx_files = all_crx(
os.path.join(conf.archive_dir, "data"), extid)
if all_crx_files == []:
logging.warning("No crx in " + extid)
else:
with tarfile.open(tarfilename, 'r') as archive:
for crx_file in all_crx_files:
with archive.extractfile(crx_file) as crx:
match = analyze_crx(conf, crx) or match
def main(argv):
"""Main function: JavaScript strings on steroids."""
config = JsStringsConfig(
comment=True,
strings=False,
group=False,
program=False,
beautify=False,
basedir=const_basedir(),
regexp=None,
parallel=1,
verbose=True)
def analyze_file(conf, filename):
    """Read `filename` as raw bytes and run the string analysis on it."""
    with open(filename, 'rb') as handle:
        raw = handle.read()
    return jsstrings_data(conf, filename, raw)
filename = None
path = None
date = None
taskid = -1
maxtaskid = -1
def compute_tasks(file_or_extids, taskid=1, maxtaskid=1):
"""Function for computing list of tasks."""
extid_re = re.compile('^[a-p]+$')
tasks = []
for file_or_extid in file_or_extids:
if is_file_with_c_style_comments(file_or_extid):
tasks.append(file_or_extid)
elif file_or_extid.endswith('.tar'):
tasks.append(file_or_extid)
elif file_or_extid.endswith('.crx'):
tasks.append(file_or_extid)
elif extid_re.match(file_or_extid):
tasks.append(file_or_extid)
else:
# default: a file with extension ide
with open(file_or_extid) as fileobj:
for line in fileobj:
line = line.strip()
if extid_re.match(line):
tasks.append(line)
try:
opts, args = getopt.getopt(argv, "hibcd:sn:N:a:vr:", [
"--regexp", "--date", "--archive", "--beautify"
])
except getopt.GetoptError:
helpmsg()
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
helpmsg()
sys.exit()
elif opt in ("-a", "--archive"):
config = config._replace(basedir=arg)
elif opt == '-i':
config = config._replace(comment=False)
elif opt == '-s':
config = config._replace(strings=True)
elif opt == '-g':
config = config._replace(group=True)
elif opt == '-c':
config = config._replace(program=True)
elif opt in ('-b', "--beautify"):
config = config._replace(beautify=True)
elif opt in ('-r', "--regexp"):
config = config._replace(regexp=arg)
elif opt in ('-d', "--date"):
date = arg
elif opt in ("-n", "--taskid"):
taskid = int(arg)
elif opt in ("-N", "--maxtaskid"):
maxtaskid = int(arg)
if len(args) == 1:
filename = args[0]
elif len(args) == 2:
filename = args[0]
path = args[1]
elif (not len(args) == 0) or taskid < 1 or maxtaskid < 1:
helpmsg()
sys.exit()
if config.verbose:
loglevel = logging.INFO
chunksize = int(len(tasks) / maxtaskid)
if taskid == maxtaskid:
tasks = tasks[(taskid - 1) * chunksize:]
else:
loglevel = logging.WARNING
tasks = tasks[(taskid - 1) * chunksize:taskid * chunksize]
return tasks
def analyze_task(conf, task):
    """Dispatch one task (js/css file, crx, tar, or extension id)."""
    logging.debug("Analyzing " + task)
    if task.endswith('.crx'):
        return analyze_crx(conf, task)
    if task.endswith('.tar'):
        return analyze_tar(conf, task)
    if re.match('^[a-p]+$', task):
        # Bare ids (letters a-p only, same pattern as extid_re elsewhere
        # in this file) are resolved to their tar archive.
        return analyze_tar(conf, task + '.tar')
    return analyze_file(conf, task)
def main(conf):
"""Main function: JavaScript strings on steroids."""
logger = logging.getLogger()
ch = logging.StreamHandler(sys.stdout)
ch.setFormatter(logging.Formatter(const_log_format()))
logger.addHandler(ch)
logger.setLevel(loglevel)
if taskid > 0 and maxtaskid > 0:
process_group(config, taskid, maxtaskid, date, path)
if conf.verbose:
logger.setLevel(logging.DEBUG)
else:
if filename.endswith('.crx'):
analyze_crx(config, filename, path)
elif filename.endswith('.tar'):
analyze_tar(config, date, path, filename)
elif extid_re.match(filename):
extid = filename
filename = os.path.join(config.basedir, 'data',
get_local_archive_dir(extid),
extid + ".tar")
analyze_tar(config, date, path, filename)
else:
with open(filename, 'rb') as fileobj:
data = fileobj.read()
jsstrings_data(filename, data, config)
logger.setLevel(logging.WARNING)
if conf.colorize:
init()
if conf.join_string_literals:
logging.warning("Joining of string literals not yet supported!")
tasks = compute_tasks(conf.FILE_OR_EXTID, conf.taskid, conf.max_taskid)
with Pool(conf.parallel) as p:
retvals = p.map(partial(analyze_task, conf), tasks)
return reduce(operator.or_, retvals, False)
if __name__ == "__main__":
    # Command-line interface: build the argparse config and hand the
    # parsed namespace to main().
    main_parser = argparse.ArgumentParser(
        description=
        'A combination of strings and grep for JavaScript and CSS files.')
    main_parser.add_argument(
        '-r',
        '--reg-exp',
        metavar='REGEXP',
        type=str,
        nargs='+',
        help='search for regular expression')
    main_parser.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        default=False,
        help='increase verbosity')
    main_parser.add_argument(
        '-o',
        '--output-decoration',
        metavar='L',
        choices=[0, 1, 2, 3],
        type=int,
        help='show only matching files, crx, tar')
    main_parser.add_argument(
        '-p',
        '--parallel',
        metavar='P',
        type=int,
        help='run P threads in parallel')
    main_parser.add_argument(
        '-D',
        '--latest-date',
        metavar='DATE',
        type=str,
        help=
        'select latest crx from tar, released before DATE. Together with --from-date, specifies all crx released in specified date range.'
    )
    main_parser.add_argument(
        '-d',
        '--from-date',
        metavar='DATE',
        type=str,
        help=
        # fixed: help text previously referenced --from-date itself
        'select oldest crx from tar released after DATE. Together with --latest-date, specifies all crx released in specified date range.'
    )
    main_parser.add_argument(
        '-f',
        '--file-pattern',
        metavar='pattern',
        type=str,
        help='process only files matching pattern')
    main_parser.add_argument(
        '-a',
        '--archive-dir',
        metavar='archive',
        type=str,
        default=const_basedir(),
        help='archive directory')
    main_parser.add_argument(
        '-C', '--colorize', action='store_true', help='use colors')
    main_parser.add_argument(
        '-n', '--taskid', metavar='n', type=int, default=1, help='task id')
    main_parser.add_argument(
        '-N',
        '--max-taskid',
        metavar='N',
        type=int,
        default=1,
        help='max task id')
    main_parser.add_argument(
        'FILE_OR_EXTID', nargs='+', help="extid/js/css/crx/tar file")

    comment_group = main_parser.add_argument_group('comment blocks')
    comment_group.add_argument(
        '-g',
        '--group-single-line-comments',
        # fixed: action was missing, so -g demanded a value instead of
        # acting as the boolean flag consumed by mince_js
        action='store_true',
        help='Group consecutive single-line comments into blocks')
    comment_group.add_argument(
        '-c',
        '--reg-exp-comments',
        metavar='REGEXP',
        type=str,
        nargs='+',
        help='search comments for regular expression')

    source_group = main_parser.add_argument_group('source blocks')
    source_group.add_argument(
        '-b',
        '--beautify',
        action='store_true',
        default=False,
        help='beautify source code')
    source_group.add_argument(
        '-s',
        '--reg-exp-source',
        metavar='REGEXP',
        type=str,
        nargs='+',
        help='search source for regular expression')

    strings_group = main_parser.add_argument_group('string literals')
    strings_group.add_argument(
        '-j',
        '--join-string-literals',
        action='store_true',
        help='join string literals (heuristic)')
    strings_group.add_argument(
        '-l',
        '--reg-exp-string-literals',
        metavar='REGEXP',
        type=str,
        nargs='+',
        help='search string literals for regular expression')

    main_conf = main_parser.parse_args()
    main(main_conf)

View File

@@ -5,5 +5,5 @@ setup(
description='A collection of utilities for downloading and analyzing browser extension from the Chrome Web store.',
author='Achim D. Brucker, Michael Herzberg',
license='GPL 3.0',
install_requires=['GitPython', 'python_magic', 'tabulate', 'requests', 'pycrypto', 'beautifulsoup4', 'python_dateutil', 'mysqlclient', 'cchardet', 'jsbeautifier']
install_requires=['GitPython', 'colorama', 'python_magic', 'tabulate', 'requests', 'pycrypto', 'beautifulsoup4', 'python_dateutil', 'mysqlclient', 'cchardet', 'jsbeautifier']
)