Restored basic functionality for single files.

2017-10-06 15:47:59 +01:00 · 2017-10-06 15:47:59 +01:00 · 5dde0a79c4
parent ff8023073f
commit 5dde0a79c4
1 changed files with 96 additions and 84 deletions
--- a/180
+++ b/180
@ -17,17 +17,18 @@
 #
 """Tool for extracting crx file from a tar archive."""

-import collections
 import datetime
 import argparse
 import io
+import fnmatch
 import os
 import logging
 import re
 import sys
+import operator
 import tarfile
 import zlib
-from functools import partial
+from functools import partial, reduce
 from multiprocessing import Pool
 from zipfile import ZipFile

@ -45,9 +46,17 @@ from ExtensionCrawler.js_mincer import mince_js
 assert sys.version_info >= (3, 4) and sys.version_info < (3, 6)


-def jsstrings_data(path, data, args):
+def jsstrings_data(conf, path, data):
+    """Analyze data in memory."""
+    if not conf.file_pattern is None:
+        if path is None: 
+            return False
+        elif not fnmatch.fnmatch(path, conf.file_pattern):
+            logging.debug("Filename \'" + path + "\' does not match pattern \'" + conf.file_pattern + "\'")
+            return False
+
    match = False
-    print("## Analyzing " + path)
+    logging.debug("Start analyzing " + path)
    file_info = init_file_info(path, data)
    if file_info['size'] == 0:
        return match
@ -57,7 +66,7 @@ def jsstrings_data(path, data, args):
            dec = zlib.decompressobj(zlib.MAX_WBITS | 16)
            dec_data = dec.decompress(data, 100 * file_info['size'])
            if file_info['dec_encoding'] is None:
-                logging.warning("Encoding is None for " + path +
+                logging.debug("Encoding is None for " + path +
                                " using utf-8.")
                str_data = dec_data.decode('UTF-8')
            else:
@ -72,49 +81,61 @@ def jsstrings_data(path, data, args):
        else:
            str_data = data.decode(file_info['encoding'])

-    if args.beautify:
+    if conf.beautify:
        str_data = jsbeautifier.beautify(str_data)

    with io.StringIO(str_data) as str_obj:
        for block in mince_js(
                str_obj,
-                single_line_comments_block=args.group_single_line_comments):
-            if analyze_block(args, block):
+                single_line_comments_block=conf.group_single_line_comments):
+            if analyze_block(conf, block):
                match = True

    return match

+def print_block(conf, block, string_match = False, code_match = False):
+    print(block)

-def analyze_block(args, block):
+def analyze_block(conf, block):
    """Print code/comment blocks."""
    match = False
-    rgx = None
-    if args.reg_exp is not None:
-        rgx = re.compile(args.reg_exp)
+    regexps = []
+    if not conf.reg_exp is None:
+        for regexp in conf.reg_exp:
+            regexps.append(re.compile(regexp))
+    if block.is_comment():
+        if not conf.reg_exp_comments is None:
+            for regexp in conf.reg_exp_comments:
+                regexps.append(re.compile(regexp))
+        for regexp in regexps:
+            if regexp.search(block.content):
+                match = True
+        if match:
+            print_block(conf, block)
+    elif block.is_code():
+        regexps_string = regexps.copy()
+        regexps_code = regexps.copy()
+        if not conf.reg_exp_string_literals is None:
+            for regexp in conf.reg_exp_string_literals:
+                regexps_string.append(re.compile(regexp))
+        if not conf.reg_exp_source is None:
+            for regexp in conf.reg_exp_source:
+                regexps_code.append(re.compile(regexp))
+        string_match = False
+        for regexp in regexps_string:
+            for string in block.string_literals:
+                if regexp.search(string):
+                    string_match = True
+        code_match = False
+        for regexp in regexps_code:
+            if regexp.search(block.content):
+                code_match = True
+        match = string_match or code_match            
+        if match:
+            print_block(conf, block, string_match, code_match)
    return match

-
-'''     if comment and block.is_comment():
-        if regexp is None or rgx.match(block.content):
-            if verbose:
-                print(block)
-            match = True
-    elif block.is_code():
-        if program:
-            if regexp is None or rgx.match(block.content):
-                if verbose:
-                    print(block)
-                match = True
-        if strings:
-            for string in block.string_literals:
-                if regexp is None or rgx.match(string):
-                    if verbose:
-                        print(string)
-                    match = True
- '''
-
-
-def analyze_crx(args, crx, path):
+def analyze_crx(conf, crx):
    match = False
    if path is None:
        with ZipFile(crx) as crxobj:
@ -136,7 +157,7 @@ def analyze_crx(args, crx, path):
    return match


-def analyze_tar(args, date, path, filename):
+def analyze_tar(conf, tarfile):
    last_crx_file = ''
    match = False
    extid = os.path.splitext(os.path.basename(filename))[0]
@ -160,29 +181,17 @@ def analyze_tar(args, date, path, filename):
        else:
            print("RegExp not found in " + extid)

-
-def process_group(args, taskid, maxtaskid, date, path):
-    archive_dir = os.path.join(args.archive_dir, "data")
-    ext_ids = get_existing_ids(archive_dir)
-    chunksize = int(len(ext_ids) / maxtaskid)
-    if taskid == maxtaskid:
-        ext_ids = ext_ids[(taskid - 1) * chunksize:]
-    else:
-        ext_ids = ext_ids[(taskid - 1) * chunksize:taskid * chunksize]
-
-    ext_ids = list(map(partial(archive_file, archive_dir), ext_ids))
-
-    with Pool(args.parallel) as p:
-        p.map(partial(analyze_tar, args, date, path), ext_ids)
+def analyze_file(conf, filename):
+    with open(filename, 'rb') as fileobj:
+        data = fileobj.read()
+    return jsstrings_data(conf, filename, data)


 def compute_tasks(file_or_extids, taskid=1, maxtaskid=1):
    """Function for computing list of tasks."""
+    extid_re = re.compile('^[a-p]+$')
    tasks = []
-    for file_or_extid in file_or_extids:
-        path = None
-        date = None
-        extid_re = re.compile('^[a-p]+$')
+    for file_or_extid in file_or_extids: 
        if file_or_extid.endswith('.crx'):
            tasks.append(file_or_extid)
        elif file_or_extid.endswith('.tar'):
@ -211,43 +220,41 @@ def compute_tasks(file_or_extids, taskid=1, maxtaskid=1):
    else:
        tasks = tasks[(taskid - 1) * chunksize:taskid * chunksize]

-
    return tasks


-def main(args):
-    """Main function: JavaScript strings on steroids."""
-    if args.verbose:
-        loglevel = logging.INFO
+def analyze_task(conf, task):
+    """Analyze one file/tar/crx/extid."""
+    logging.debug("Analyzing " + task)
+    extid_re = re.compile('^[a-p]+$')
+    retval = False
+    if task.endswith('.crx'):
+        retval = analyze_crx(conf, task)
+    elif task.endswith('.tar'):
+        retval = analyze_tar(conf, task)
+    elif extid_re.match(task):
+        retval = analyze_tar(conf, task + '.tar')
    else:
-        loglevel = logging.WARNING
+        retval = analyze_file(conf, task)
+    return retval
+
+
+def main(conf):
+    """Main function: JavaScript strings on steroids."""
    logger = logging.getLogger()
    ch = logging.StreamHandler(sys.stdout)
    ch.setFormatter(logging.Formatter(const_log_format()))
    logger.addHandler(ch)
-    logger.setLevel(loglevel)
+    if conf.verbose:
+        logger.setLevel(logging.DEBUG)
+    else:
+        logger.setLevel(logging.WARNING)

-    print(vars(args))
-
-    tasks = compute_tasks(args.FILE_OR_EXTID, args.taskid, args.max_taskid)
-
-    print(tasks)
-
-    for file_or_extid in tasks:
-        if file_or_extid.endswith('.crx'):
-            analyze_crx(args, file_or_extid, path)
-        elif file_or_extid.endswith('.tar'):
-            analyze_tar(args, date, path, file_or_extid)
-        elif extid_re.match(file_or_extid):
-            extid = file_or_extid
-            file_or_extid = os.path.join(args.basedir, 'data',
-                                         get_local_archive_dir(extid),
-                                         extid + ".tar")
-            analyze_tar(args, date, path, filename)
-        else:
-            with open(file_or_extid, 'rb') as fileobj:
-                data = fileobj.read()
-            jsstrings_data(filename, data, args)
+    print(vars(conf))
+    tasks = compute_tasks(conf.FILE_OR_EXTID, conf.taskid, conf.max_taskid)
+    with Pool(conf.parallel) as p:
+        retvals = p.map(partial(analyze_task, conf), tasks)
+    return reduce(operator.or_, retvals, False)


 if __name__ == "__main__":
@ -302,7 +309,12 @@ if __name__ == "__main__":
    main_parser.add_argument(
        '-n', '--taskid', metavar='n', type=int, default=1, help='task id')
    main_parser.add_argument(
-        '-N', '--max-taskid', metavar='N', type=int, default=1, help='max task id')
+        '-N',
+        '--max-taskid',
+        metavar='N',
+        type=int,
+        default=1,
+        help='max task id')

    main_parser.add_argument(
        'FILE_OR_EXTID', nargs='+', help="extid/js/css/crx/tar file")
@ -348,6 +360,6 @@ if __name__ == "__main__":
        type=str,
        nargs='+',
        help='search string literals for regular expression')
-    args = main_parser.parse_args()
+    main_conf = main_parser.parse_args()

-    main(args)
+    main(main_conf)