Restored basic functionality for single files.
This commit is contained in:
parent
ff8023073f
commit
5dde0a79c4
180
crx-jsstrings
180
crx-jsstrings
|
@ -17,17 +17,18 @@
|
|||
#
|
||||
"""Tool for extracting crx file from a tar archive."""
|
||||
|
||||
import collections
|
||||
import datetime
|
||||
import argparse
|
||||
import io
|
||||
import fnmatch
|
||||
import os
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
import operator
|
||||
import tarfile
|
||||
import zlib
|
||||
from functools import partial
|
||||
from functools import partial, reduce
|
||||
from multiprocessing import Pool
|
||||
from zipfile import ZipFile
|
||||
|
||||
|
@ -45,9 +46,17 @@ from ExtensionCrawler.js_mincer import mince_js
|
|||
assert sys.version_info >= (3, 4) and sys.version_info < (3, 6)
|
||||
|
||||
|
||||
def jsstrings_data(path, data, args):
|
||||
def jsstrings_data(conf, path, data):
|
||||
"""Analyze data in memory."""
|
||||
if not conf.file_pattern is None:
|
||||
if path is None:
|
||||
return False
|
||||
elif not fnmatch.fnmatch(path, conf.file_pattern):
|
||||
logging.debug("Filename \'" + path + "\' does not match pattern \'" + conf.file_pattern + "\'")
|
||||
return False
|
||||
|
||||
match = False
|
||||
print("## Analyzing " + path)
|
||||
logging.debug("Start analyzing " + path)
|
||||
file_info = init_file_info(path, data)
|
||||
if file_info['size'] == 0:
|
||||
return match
|
||||
|
@ -57,7 +66,7 @@ def jsstrings_data(path, data, args):
|
|||
dec = zlib.decompressobj(zlib.MAX_WBITS | 16)
|
||||
dec_data = dec.decompress(data, 100 * file_info['size'])
|
||||
if file_info['dec_encoding'] is None:
|
||||
logging.warning("Encoding is None for " + path +
|
||||
logging.debug("Encoding is None for " + path +
|
||||
" using utf-8.")
|
||||
str_data = dec_data.decode('UTF-8')
|
||||
else:
|
||||
|
@ -72,49 +81,61 @@ def jsstrings_data(path, data, args):
|
|||
else:
|
||||
str_data = data.decode(file_info['encoding'])
|
||||
|
||||
if args.beautify:
|
||||
if conf.beautify:
|
||||
str_data = jsbeautifier.beautify(str_data)
|
||||
|
||||
with io.StringIO(str_data) as str_obj:
|
||||
for block in mince_js(
|
||||
str_obj,
|
||||
single_line_comments_block=args.group_single_line_comments):
|
||||
if analyze_block(args, block):
|
||||
single_line_comments_block=conf.group_single_line_comments):
|
||||
if analyze_block(conf, block):
|
||||
match = True
|
||||
|
||||
return match
|
||||
|
||||
def print_block(conf, block, string_match = False, code_match = False):
|
||||
print(block)
|
||||
|
||||
def analyze_block(args, block):
|
||||
def analyze_block(conf, block):
|
||||
"""Print code/comment blocks."""
|
||||
match = False
|
||||
rgx = None
|
||||
if args.reg_exp is not None:
|
||||
rgx = re.compile(args.reg_exp)
|
||||
regexps = []
|
||||
if not conf.reg_exp is None:
|
||||
for regexp in conf.reg_exp:
|
||||
regexps.append(re.compile(regexp))
|
||||
if block.is_comment():
|
||||
if not conf.reg_exp_comments is None:
|
||||
for regexp in conf.reg_exp_comments:
|
||||
regexps.append(re.compile(regexp))
|
||||
for regexp in regexps:
|
||||
if regexp.search(block.content):
|
||||
match = True
|
||||
if match:
|
||||
print_block(conf, block)
|
||||
elif block.is_code():
|
||||
regexps_string = regexps.copy()
|
||||
regexps_code = regexps.copy()
|
||||
if not conf.reg_exp_string_literals is None:
|
||||
for regexp in conf.reg_exp_string_literals:
|
||||
regexps_string.append(re.compile(regexp))
|
||||
if not conf.reg_exp_source is None:
|
||||
for regexp in conf.reg_exp_source:
|
||||
regexps_code.append(re.compile(regexp))
|
||||
string_match = False
|
||||
for regexp in regexps_string:
|
||||
for string in block.string_literals:
|
||||
if regexp.search(string):
|
||||
string_match = True
|
||||
code_match = False
|
||||
for regexp in regexps_code:
|
||||
if regexp.search(block.content):
|
||||
code_match = True
|
||||
match = string_match or code_match
|
||||
if match:
|
||||
print_block(conf, block, string_match, code_match)
|
||||
return match
|
||||
|
||||
|
||||
''' if comment and block.is_comment():
|
||||
if regexp is None or rgx.match(block.content):
|
||||
if verbose:
|
||||
print(block)
|
||||
match = True
|
||||
elif block.is_code():
|
||||
if program:
|
||||
if regexp is None or rgx.match(block.content):
|
||||
if verbose:
|
||||
print(block)
|
||||
match = True
|
||||
if strings:
|
||||
for string in block.string_literals:
|
||||
if regexp is None or rgx.match(string):
|
||||
if verbose:
|
||||
print(string)
|
||||
match = True
|
||||
'''
|
||||
|
||||
|
||||
def analyze_crx(args, crx, path):
|
||||
def analyze_crx(conf, crx):
|
||||
match = False
|
||||
if path is None:
|
||||
with ZipFile(crx) as crxobj:
|
||||
|
@ -136,7 +157,7 @@ def analyze_crx(args, crx, path):
|
|||
return match
|
||||
|
||||
|
||||
def analyze_tar(args, date, path, filename):
|
||||
def analyze_tar(conf, tarfile):
|
||||
last_crx_file = ''
|
||||
match = False
|
||||
extid = os.path.splitext(os.path.basename(filename))[0]
|
||||
|
@ -160,29 +181,17 @@ def analyze_tar(args, date, path, filename):
|
|||
else:
|
||||
print("RegExp not found in " + extid)
|
||||
|
||||
|
||||
def process_group(args, taskid, maxtaskid, date, path):
|
||||
archive_dir = os.path.join(args.archive_dir, "data")
|
||||
ext_ids = get_existing_ids(archive_dir)
|
||||
chunksize = int(len(ext_ids) / maxtaskid)
|
||||
if taskid == maxtaskid:
|
||||
ext_ids = ext_ids[(taskid - 1) * chunksize:]
|
||||
else:
|
||||
ext_ids = ext_ids[(taskid - 1) * chunksize:taskid * chunksize]
|
||||
|
||||
ext_ids = list(map(partial(archive_file, archive_dir), ext_ids))
|
||||
|
||||
with Pool(args.parallel) as p:
|
||||
p.map(partial(analyze_tar, args, date, path), ext_ids)
|
||||
def analyze_file(conf, filename):
|
||||
with open(filename, 'rb') as fileobj:
|
||||
data = fileobj.read()
|
||||
return jsstrings_data(conf, filename, data)
|
||||
|
||||
|
||||
def compute_tasks(file_or_extids, taskid=1, maxtaskid=1):
|
||||
"""Function for computing list of tasks."""
|
||||
extid_re = re.compile('^[a-p]+$')
|
||||
tasks = []
|
||||
for file_or_extid in file_or_extids:
|
||||
path = None
|
||||
date = None
|
||||
extid_re = re.compile('^[a-p]+$')
|
||||
for file_or_extid in file_or_extids:
|
||||
if file_or_extid.endswith('.crx'):
|
||||
tasks.append(file_or_extid)
|
||||
elif file_or_extid.endswith('.tar'):
|
||||
|
@ -211,43 +220,41 @@ def compute_tasks(file_or_extids, taskid=1, maxtaskid=1):
|
|||
else:
|
||||
tasks = tasks[(taskid - 1) * chunksize:taskid * chunksize]
|
||||
|
||||
|
||||
return tasks
|
||||
|
||||
|
||||
def main(args):
|
||||
"""Main function: JavaScript strings on steroids."""
|
||||
if args.verbose:
|
||||
loglevel = logging.INFO
|
||||
def analyze_task(conf, task):
|
||||
"""Analyze one file/tar/crx/extid."""
|
||||
logging.debug("Analyzing " + task)
|
||||
extid_re = re.compile('^[a-p]+$')
|
||||
retval = False
|
||||
if task.endswith('.crx'):
|
||||
retval = analyze_crx(conf, task)
|
||||
elif task.endswith('.tar'):
|
||||
retval = analyze_tar(conf, task)
|
||||
elif extid_re.match(task):
|
||||
retval = analyze_tar(conf, task + '.tar')
|
||||
else:
|
||||
loglevel = logging.WARNING
|
||||
retval = analyze_file(conf, task)
|
||||
return retval
|
||||
|
||||
|
||||
def main(conf):
|
||||
"""Main function: JavaScript strings on steroids."""
|
||||
logger = logging.getLogger()
|
||||
ch = logging.StreamHandler(sys.stdout)
|
||||
ch.setFormatter(logging.Formatter(const_log_format()))
|
||||
logger.addHandler(ch)
|
||||
logger.setLevel(loglevel)
|
||||
if conf.verbose:
|
||||
logger.setLevel(logging.DEBUG)
|
||||
else:
|
||||
logger.setLevel(logging.WARNING)
|
||||
|
||||
print(vars(args))
|
||||
|
||||
tasks = compute_tasks(args.FILE_OR_EXTID, args.taskid, args.max_taskid)
|
||||
|
||||
print(tasks)
|
||||
|
||||
for file_or_extid in tasks:
|
||||
if file_or_extid.endswith('.crx'):
|
||||
analyze_crx(args, file_or_extid, path)
|
||||
elif file_or_extid.endswith('.tar'):
|
||||
analyze_tar(args, date, path, file_or_extid)
|
||||
elif extid_re.match(file_or_extid):
|
||||
extid = file_or_extid
|
||||
file_or_extid = os.path.join(args.basedir, 'data',
|
||||
get_local_archive_dir(extid),
|
||||
extid + ".tar")
|
||||
analyze_tar(args, date, path, filename)
|
||||
else:
|
||||
with open(file_or_extid, 'rb') as fileobj:
|
||||
data = fileobj.read()
|
||||
jsstrings_data(filename, data, args)
|
||||
print(vars(conf))
|
||||
tasks = compute_tasks(conf.FILE_OR_EXTID, conf.taskid, conf.max_taskid)
|
||||
with Pool(conf.parallel) as p:
|
||||
retvals = p.map(partial(analyze_task, conf), tasks)
|
||||
return reduce(operator.or_, retvals, False)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -302,7 +309,12 @@ if __name__ == "__main__":
|
|||
main_parser.add_argument(
|
||||
'-n', '--taskid', metavar='n', type=int, default=1, help='task id')
|
||||
main_parser.add_argument(
|
||||
'-N', '--max-taskid', metavar='N', type=int, default=1, help='max task id')
|
||||
'-N',
|
||||
'--max-taskid',
|
||||
metavar='N',
|
||||
type=int,
|
||||
default=1,
|
||||
help='max task id')
|
||||
|
||||
main_parser.add_argument(
|
||||
'FILE_OR_EXTID', nargs='+', help="extid/js/css/crx/tar file")
|
||||
|
@ -348,6 +360,6 @@ if __name__ == "__main__":
|
|||
type=str,
|
||||
nargs='+',
|
||||
help='search string literals for regular expression')
|
||||
args = main_parser.parse_args()
|
||||
main_conf = main_parser.parse_args()
|
||||
|
||||
main(args)
|
||||
main(main_conf)
|
||||
|
|
Loading…
Reference in New Issue