Restored basic functionality for single files.

This commit is contained in:
Achim D. Brucker 2017-10-06 15:47:59 +01:00
parent ff8023073f
commit 5dde0a79c4
1 changed files with 96 additions and 84 deletions

View File

@ -17,17 +17,18 @@
#
"""Tool for extracting crx file from a tar archive."""
import collections
import datetime
import argparse
import io
import fnmatch
import os
import logging
import re
import sys
import operator
import tarfile
import zlib
from functools import partial
from functools import partial, reduce
from multiprocessing import Pool
from zipfile import ZipFile
@ -45,9 +46,17 @@ from ExtensionCrawler.js_mincer import mince_js
assert sys.version_info >= (3, 4) and sys.version_info < (3, 6)
def jsstrings_data(path, data, args):
def jsstrings_data(conf, path, data):
"""Analyze data in memory."""
if not conf.file_pattern is None:
if path is None:
return False
elif not fnmatch.fnmatch(path, conf.file_pattern):
logging.debug("Filename \'" + path + "\' does not match pattern \'" + conf.file_pattern + "\'")
return False
match = False
print("## Analyzing " + path)
logging.debug("Start analyzing " + path)
file_info = init_file_info(path, data)
if file_info['size'] == 0:
return match
@ -57,7 +66,7 @@ def jsstrings_data(path, data, args):
dec = zlib.decompressobj(zlib.MAX_WBITS | 16)
dec_data = dec.decompress(data, 100 * file_info['size'])
if file_info['dec_encoding'] is None:
logging.warning("Encoding is None for " + path +
logging.debug("Encoding is None for " + path +
" using utf-8.")
str_data = dec_data.decode('UTF-8')
else:
@ -72,49 +81,61 @@ def jsstrings_data(path, data, args):
else:
str_data = data.decode(file_info['encoding'])
if args.beautify:
if conf.beautify:
str_data = jsbeautifier.beautify(str_data)
with io.StringIO(str_data) as str_obj:
for block in mince_js(
str_obj,
single_line_comments_block=args.group_single_line_comments):
if analyze_block(args, block):
single_line_comments_block=conf.group_single_line_comments):
if analyze_block(conf, block):
match = True
return match
def print_block(conf, block, string_match = False, code_match = False):
"""Print a matching block to stdout.

conf, string_match and code_match are accepted but are not used by this
function yet; the block is printed as-is.
"""
print(block)
def analyze_block(args, block):
def analyze_block(conf, block):
"""Print code/comment blocks."""
match = False
rgx = None
if args.reg_exp is not None:
rgx = re.compile(args.reg_exp)
regexps = []
if not conf.reg_exp is None:
for regexp in conf.reg_exp:
regexps.append(re.compile(regexp))
if block.is_comment():
if not conf.reg_exp_comments is None:
for regexp in conf.reg_exp_comments:
regexps.append(re.compile(regexp))
for regexp in regexps:
if regexp.search(block.content):
match = True
if match:
print_block(conf, block)
elif block.is_code():
regexps_string = regexps.copy()
regexps_code = regexps.copy()
if not conf.reg_exp_string_literals is None:
for regexp in conf.reg_exp_string_literals:
regexps_string.append(re.compile(regexp))
if not conf.reg_exp_source is None:
for regexp in conf.reg_exp_source:
regexps_code.append(re.compile(regexp))
string_match = False
for regexp in regexps_string:
for string in block.string_literals:
if regexp.search(string):
string_match = True
code_match = False
for regexp in regexps_code:
if regexp.search(block.content):
code_match = True
match = string_match or code_match
if match:
print_block(conf, block, string_match, code_match)
return match
''' if comment and block.is_comment():
if regexp is None or rgx.match(block.content):
if verbose:
print(block)
match = True
elif block.is_code():
if program:
if regexp is None or rgx.match(block.content):
if verbose:
print(block)
match = True
if strings:
for string in block.string_literals:
if regexp is None or rgx.match(string):
if verbose:
print(string)
match = True
'''
def analyze_crx(args, crx, path):
def analyze_crx(conf, crx):
match = False
if path is None:
with ZipFile(crx) as crxobj:
@ -136,7 +157,7 @@ def analyze_crx(args, crx, path):
return match
def analyze_tar(args, date, path, filename):
def analyze_tar(conf, tarfile):
last_crx_file = ''
match = False
extid = os.path.splitext(os.path.basename(filename))[0]
@ -160,29 +181,17 @@ def analyze_tar(args, date, path, filename):
else:
print("RegExp not found in " + extid)
def process_group(args, taskid, maxtaskid, date, path):
"""Analyze the share of the archive that belongs to one task.

The ids returned by get_existing_ids for <args.archive_dir>/data are cut
into maxtaskid chunks; the chunk selected by taskid is mapped through
archive_file and then handed to analyze_tar in a pool of args.parallel
worker processes. Nothing is returned.
"""
archive_dir = os.path.join(args.archive_dir, "data")
ext_ids = get_existing_ids(archive_dir)
# Chunk size is rounded down, so the chunks may not cover every id evenly.
chunksize = int(len(ext_ids) / maxtaskid)
if taskid == maxtaskid:
# The last task takes everything from its start index to the end,
# which picks up the remainder left over by the rounding above.
ext_ids = ext_ids[(taskid - 1) * chunksize:]
else:
# NOTE(review): the (taskid - 1) offset suggests taskid is 1-based -- confirm.
ext_ids = ext_ids[(taskid - 1) * chunksize:taskid * chunksize]
# presumably archive_file turns an id into the path of its tar archive;
# verify against its definition.
ext_ids = list(map(partial(archive_file, archive_dir), ext_ids))
with Pool(args.parallel) as p:
# The results of analyze_tar are discarded.
p.map(partial(analyze_tar, args, date, path), ext_ids)
def analyze_file(conf, filename):
"""Analyze a single file on disk.

Reads the whole file as bytes and passes it, together with its name, to
jsstrings_data; the result of that call is returned.
"""
# Binary mode: the raw bytes are handed over undecoded.
with open(filename, 'rb') as fileobj:
data = fileobj.read()
return jsstrings_data(conf, filename, data)
def compute_tasks(file_or_extids, taskid=1, maxtaskid=1):
"""Function for computing list of tasks."""
extid_re = re.compile('^[a-p]+$')
tasks = []
for file_or_extid in file_or_extids:
path = None
date = None
extid_re = re.compile('^[a-p]+$')
for file_or_extid in file_or_extids:
if file_or_extid.endswith('.crx'):
tasks.append(file_or_extid)
elif file_or_extid.endswith('.tar'):
@ -211,43 +220,41 @@ def compute_tasks(file_or_extids, taskid=1, maxtaskid=1):
else:
tasks = tasks[(taskid - 1) * chunksize:taskid * chunksize]
return tasks
def main(args):
"""Main function: JavaScript strings on steroids."""
if args.verbose:
loglevel = logging.INFO
def analyze_task(conf, task):
"""Analyze one file/tar/crx/extid."""
logging.debug("Analyzing " + task)
extid_re = re.compile('^[a-p]+$')
retval = False
if task.endswith('.crx'):
retval = analyze_crx(conf, task)
elif task.endswith('.tar'):
retval = analyze_tar(conf, task)
elif extid_re.match(task):
retval = analyze_tar(conf, task + '.tar')
else:
loglevel = logging.WARNING
retval = analyze_file(conf, task)
return retval
def main(conf):
"""Main function: JavaScript strings on steroids."""
logger = logging.getLogger()
ch = logging.StreamHandler(sys.stdout)
ch.setFormatter(logging.Formatter(const_log_format()))
logger.addHandler(ch)
logger.setLevel(loglevel)
if conf.verbose:
logger.setLevel(logging.DEBUG)
else:
logger.setLevel(logging.WARNING)
print(vars(args))
tasks = compute_tasks(args.FILE_OR_EXTID, args.taskid, args.max_taskid)
print(tasks)
for file_or_extid in tasks:
if file_or_extid.endswith('.crx'):
analyze_crx(args, file_or_extid, path)
elif file_or_extid.endswith('.tar'):
analyze_tar(args, date, path, file_or_extid)
elif extid_re.match(file_or_extid):
extid = file_or_extid
file_or_extid = os.path.join(args.basedir, 'data',
get_local_archive_dir(extid),
extid + ".tar")
analyze_tar(args, date, path, filename)
else:
with open(file_or_extid, 'rb') as fileobj:
data = fileobj.read()
jsstrings_data(filename, data, args)
print(vars(conf))
tasks = compute_tasks(conf.FILE_OR_EXTID, conf.taskid, conf.max_taskid)
with Pool(conf.parallel) as p:
retvals = p.map(partial(analyze_task, conf), tasks)
return reduce(operator.or_, retvals, False)
if __name__ == "__main__":
@ -302,7 +309,12 @@ if __name__ == "__main__":
main_parser.add_argument(
'-n', '--taskid', metavar='n', type=int, default=1, help='task id')
main_parser.add_argument(
'-N', '--max-taskid', metavar='N', type=int, default=1, help='max task id')
'-N',
'--max-taskid',
metavar='N',
type=int,
default=1,
help='max task id')
main_parser.add_argument(
'FILE_OR_EXTID', nargs='+', help="extid/js/css/crx/tar file")
@ -348,6 +360,6 @@ if __name__ == "__main__":
type=str,
nargs='+',
help='search string literals for regular expression')
args = main_parser.parse_args()
main_conf = main_parser.parse_args()
main(args)
main(main_conf)