ExtensionCrawler/crx-jsstrings

#!/usr/bin/env python3.5
#
# Copyright (C) 2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
"""Tool for extracting crx file from a tar archive."""
import datetime
import argparse
import io
import fnmatch
import os
import logging
import re
import sys
import operator
import tarfile
import zlib
from functools import partial, reduce
from multiprocessing import Pool
from zipfile import ZipFile
import dateutil
import dateutil.parser
import jsbeautifier
from ExtensionCrawler.config import (const_log_format, const_basedir,
                                     archive_file, get_local_archive_dir)
from ExtensionCrawler.archive import get_existing_ids, last_crx
from ExtensionCrawler.js_decomposer import init_file_info
from ExtensionCrawler.js_mincer import mince_js
# Script should run with python 3.4 or 3.5
assert sys.version_info >= (3, 4) and sys.version_info < (3, 6)


def jsstrings_data(conf, path, data):
    """Analyze data in memory."""
    if conf.file_pattern is not None:
        if path is None:
            return False
        elif not fnmatch.fnmatch(path, conf.file_pattern):
            logging.debug("Filename '" + path + "' does not match pattern '" +
                          conf.file_pattern + "'")
            return False

    match = False
    logging.debug("Start analyzing " + path)
    file_info = init_file_info(path, data)
    if file_info['size'] == 0:
        return match

    if file_info['dec_encoding'] is not None:
        # Compressed file: decompress (gzip wrapper) before decoding.
        try:
            dec = zlib.decompressobj(zlib.MAX_WBITS | 16)
            dec_data = dec.decompress(data, 100 * file_info['size'])
            if file_info['dec_encoding'] is None:
                logging.debug("Encoding is None for " + path +
                              ", using utf-8.")
                str_data = dec_data.decode('UTF-8')
            else:
                str_data = dec_data.decode(file_info['dec_encoding'])
            del dec_data
        except Exception:
            return match
    else:
        if file_info['encoding'] is None:
            logging.warning("Encoding is None for " + path + ", using utf-8.")
            str_data = data.decode('UTF-8')
        else:
            str_data = data.decode(file_info['encoding'])

    if conf.beautify:
        str_data = jsbeautifier.beautify(str_data)

    with io.StringIO(str_data) as str_obj:
        for block in mince_js(
                str_obj,
                single_line_comments_block=conf.group_single_line_comments):
            if analyze_block(conf, block):
                match = True
    return match


def print_block(conf, block, string_match=False, code_match=False):
    """Print a matching code/comment block."""
    print(block)


def analyze_block(conf, block):
    """Analyze a code/comment block; print it if it matches."""
    match = False
    regexps = []
    if conf.reg_exp is not None:
        for regexp in conf.reg_exp:
            regexps.append(re.compile(regexp))
    if block.is_comment():
        if conf.reg_exp_comments is not None:
            for regexp in conf.reg_exp_comments:
                regexps.append(re.compile(regexp))
        for regexp in regexps:
            if regexp.search(block.content):
                match = True
        if match:
            print_block(conf, block)
    elif block.is_code():
        regexps_string = regexps.copy()
        regexps_code = regexps.copy()
        if conf.reg_exp_string_literals is not None:
            for regexp in conf.reg_exp_string_literals:
                regexps_string.append(re.compile(regexp))
        if conf.reg_exp_source is not None:
            for regexp in conf.reg_exp_source:
                regexps_code.append(re.compile(regexp))
        string_match = False
        for regexp in regexps_string:
            for string in block.string_literals:
                if regexp.search(string):
                    string_match = True
        code_match = False
        for regexp in regexps_code:
            if regexp.search(block.content):
                code_match = True
        match = string_match or code_match
        if match:
            print_block(conf, block, string_match, code_match)
    return match
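
# How the regular-expression options combine (as implemented above): patterns
# given with -r/--reg-exp are applied to every block; -c/--reg-exp-comments
# additionally applies to comment blocks; within code blocks,
# -l/--reg-exp-string-literals is matched against string literals only and
# -s/--reg-exp-source against the full block content. As an illustrative
# sketch, searching string literals for a hard-coded token could use
#
#   crx-jsstrings extension.crx -l 'api[_-]?key'
#
# (the pattern and file name are hypothetical).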


def analyze_crx(conf, crx):
    """Analyze crx file."""
    match = False
    with ZipFile(crx) as crxobj:
        js_files = [
            f for f in crxobj.infolist()
            if f.filename.endswith(('.js', '.js.gz', '.jgz', '.jsg',
                                    '.css.gz'))
        ]
        for jsfile in js_files:
            with crxobj.open(jsfile) as js_file_obj:
                data = js_file_obj.read()
                path = js_file_obj.name
                if jsstrings_data(conf, path, data):
                    match = True
    return match


def analyze_tar(conf, filename):
    """Analyze the most recent crx inside a tar archive."""
    last_crx_file = ''
    match = False
    extid = os.path.splitext(os.path.basename(filename))[0]
    # Note: this version of the script defines no --date option, so a date is
    # only used if the configuration happens to provide one.
    date = getattr(conf, 'date', None)
    if date is not None:
        dateobj = dateutil.parser.parse(date)
        if dateobj.tzinfo is None or dateobj.tzinfo.utcoffset(dateobj) is None:
            dateobj = dateobj.replace(tzinfo=datetime.timezone.utc)
        last_crx_file = last_crx(
            os.path.join(conf.archive_dir, "data"), extid, dateobj)
    else:
        last_crx_file = last_crx(os.path.join(conf.archive_dir, "data"), extid)

    if last_crx_file == "" or last_crx_file is None:
        print("No crx in " + extid)
    else:
        print("# Start analyzing " + extid)
        with tarfile.open(filename, 'r') as archive:
            with archive.extractfile(last_crx_file) as crx:
                match = analyze_crx(conf, crx)
        if match:
            print("RegExp found in " + extid)
        else:
            print("RegExp not found in " + extid)
    return match


def analyze_file(conf, filename):
    """Analyze a single JavaScript/CSS file on disk."""
    with open(filename, 'rb') as fileobj:
        data = fileobj.read()
    return jsstrings_data(conf, filename, data)


def compute_tasks(file_or_extids, taskid=1, maxtaskid=1):
    """Compute the list of tasks from files and/or extension ids."""
    extid_re = re.compile('^[a-p]+$')
    tasks = []
    for file_or_extid in file_or_extids:
        if file_or_extid.endswith(('.crx', '.tar', '.css', '.js', '.c',
                                   '.java')):
            tasks.append(file_or_extid)
        elif extid_re.match(file_or_extid):
            tasks.append(file_or_extid)
        else:
            # default: a file containing extension ids, one per line
            with open(file_or_extid) as fileobj:
                for line in fileobj:
                    line = line.strip()
                    if extid_re.match(line):
                        tasks.append(line)

    # Select the chunk of tasks belonging to this task id.
    chunksize = len(tasks) // maxtaskid
    if taskid == maxtaskid:
        tasks = tasks[(taskid - 1) * chunksize:]
    else:
        tasks = tasks[(taskid - 1) * chunksize:taskid * chunksize]
    return tasks
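
# Chunking sketch (hypothetical numbers): with 10 tasks and --max-taskid 3,
# chunksize is 10 // 3 = 3, so task id 1 processes tasks[0:3], task id 2
# processes tasks[3:6], and the last task id (3) processes the remainder
# tasks[6:], i.e. four tasks.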


def analyze_task(conf, task):
    """Analyze one file/tar/crx/extid."""
    logging.debug("Analyzing " + task)
    extid_re = re.compile('^[a-p]+$')
    retval = False
    if task.endswith('.crx'):
        retval = analyze_crx(conf, task)
    elif task.endswith('.tar'):
        retval = analyze_tar(conf, task)
    elif extid_re.match(task):
        retval = analyze_tar(conf, task + '.tar')
    else:
        retval = analyze_file(conf, task)
    return retval


def main(conf):
    """Main function: JavaScript strings on steroids."""
    logger = logging.getLogger()
    ch = logging.StreamHandler(sys.stdout)
    ch.setFormatter(logging.Formatter(const_log_format()))
    logger.addHandler(ch)
    if conf.verbose:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.WARNING)

    print(vars(conf))
    tasks = compute_tasks(conf.FILE_OR_EXTID, conf.taskid, conf.max_taskid)
    with Pool(conf.parallel) as p:
        retvals = p.map(partial(analyze_task, conf), tasks)
    # True iff at least one task produced a match.
    return reduce(operator.or_, retvals, False)
if __name__ == "__main__":
main_parser = argparse.ArgumentParser(
description=
'A combination of strings and grep for JavaScript and CSS files.')
main_parser.add_argument(
'-r',
'--reg-exp',
metavar='REGEXP',
type=str,
nargs='+',
help='search for regular expression')
2017-10-02 19:23:45 +00:00
main_parser.add_argument(
'-v',
'--verbose',
action='store_true',
2017-10-02 19:23:45 +00:00
default=False,
help='increase verbosity')
2017-10-04 21:06:17 +00:00
main_parser.add_argument(
'-d',
'--output-decoration',
metavar='L',
choices=[0, 1, 2, 3],
type=int,
help='show only matching files, crx, tar')
    main_parser.add_argument(
        '-p',
        '--parallel',
        metavar='P',
        type=int,
        help='run P processes in parallel')
    main_parser.add_argument(
        '-f',
        '--file-pattern',
        metavar='pattern',
        type=str,
        help='process only files matching pattern')
    main_parser.add_argument(
        '-a',
        '--archive-dir',
        metavar='archive',
        type=str,
        default=const_basedir(),
        help='archive directory')
    main_parser.add_argument(
        '-C', '--colorize', action='store_true', help='use colors')
    main_parser.add_argument(
        '-n', '--taskid', metavar='n', type=int, default=1, help='task id')
    main_parser.add_argument(
        '-N',
        '--max-taskid',
        metavar='N',
        type=int,
        default=1,
        help='max task id')
    main_parser.add_argument(
        'FILE_OR_EXTID', nargs='+', help="extid/js/css/crx/tar file")

    comment_group = main_parser.add_argument_group('comment blocks')
    comment_group.add_argument(
        '-g',
        '--group-single-line-comments',
        action='store_true',
        help='group consecutive single-line comments into blocks')
    comment_group.add_argument(
        '-c',
        '--reg-exp-comments',
        metavar='REGEXP',
        type=str,
        nargs='+',
        help='search comments for regular expression')

    source_group = main_parser.add_argument_group('source blocks')
    source_group.add_argument(
        '-b',
        '--beautify',
        action='store_true',
        default=False,
        help='beautify source code')
    source_group.add_argument(
        '-s',
        '--reg-exp-source',
        metavar='REGEXP',
        type=str,
        nargs='+',
        help='search source for regular expression')

    strings_group = main_parser.add_argument_group('string literals')
    strings_group.add_argument(
        '-j',
        '--join-string-literals',
        action='store_true',
        help='join string literals (heuristic)')
    strings_group.add_argument(
        '-l',
        '--reg-exp-string-literals',
        metavar='REGEXP',
        type=str,
        nargs='+',
        help='search string literals for regular expression')

    main_conf = main_parser.parse_args()
    main(main_conf)
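
# Illustrative sketch for splitting a long list of extension ids across
# several processes, e.g. in a batch/array job (the id file, archive path,
# and pattern below are hypothetical):
#
#   crx-jsstrings extension-ids.txt -a /srv/archive -r 'chrome\.webRequest' \
#       -N 10 -n 3 -p 8
#
# Task 3 of 10 then analyzes its chunk of the ids (see compute_tasks) using a
# pool of 8 worker processes.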