#!/usr/bin/env python3.5
#
# Copyright (C) 2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
"""Tool for extracting crx file from a tar archive."""
import argparse
import datetime
import fnmatch
import io
import logging
import operator
import os
import re
import sys
import tarfile
import zlib
from functools import partial, reduce
from multiprocessing import Pool
from zipfile import ZipFile

import dateutil
import dateutil.parser
import jsbeautifier

from ExtensionCrawler.archive import get_existing_ids, last_crx
from ExtensionCrawler.config import (archive_file, const_basedir,
                                     const_log_format, get_local_archive_dir)
from ExtensionCrawler.js_decomposer import init_file_info
from ExtensionCrawler.js_mincer import mince_js

# Script should run with python 3.4 or 3.5
assert sys.version_info >= (3, 4) and sys.version_info < (3, 6)
def jsstrings_data(conf, path, data):
    """Analyze one file's raw bytes and report whether any regexp matched.

    Optionally filters by ``conf.file_pattern``, decompresses gzip'ed
    content, decodes it to text, optionally beautifies it, and feeds the
    result block-by-block through ``analyze_block``.

    Args:
        conf: parsed command-line configuration (argparse namespace).
        path: file name used for pattern matching and logging; may be None.
        data: raw file contents as bytes.

    Returns:
        True iff at least one block matched a configured regexp.
    """
    if conf.file_pattern is not None:
        if path is None:
            return False
        elif not fnmatch.fnmatch(path, conf.file_pattern):
            logging.debug("Filename '%s' does not match pattern '%s'", path,
                          conf.file_pattern)
            return False

    match = False
    # Lazy %s formatting also fixes a TypeError: the old
    # "Start analyzing " + path concatenation crashed when path was None
    # (which is allowed whenever no file pattern is configured).
    logging.debug("Start analyzing %s", path)
    file_info = init_file_info(path, data)

    if file_info['size'] == 0:
        return match

    if file_info['dec_encoding'] is not None:
        try:
            # MAX_WBITS | 16 tells zlib to expect a gzip header/trailer.
            dec = zlib.decompressobj(zlib.MAX_WBITS | 16)
            dec_data = dec.decompress(data, 100 * file_info['size'])
            # NOTE(review): this inner branch is unreachable — the enclosing
            # guard already ensures dec_encoding is not None. Kept verbatim
            # in case the outer condition is later widened.
            if file_info['dec_encoding'] is None:
                logging.debug("Encoding is None for %s using utf-8.", path)
                str_data = dec_data.decode('UTF-8')
            else:
                str_data = dec_data.decode(file_info['dec_encoding'])
            del dec_data
        except Exception:
            # Best effort: corrupt or undecodable archives are skipped.
            return match
    else:
        if file_info['encoding'] is None:
            logging.warning("Encoding is None for %s using utf-8.", path)
            str_data = data.decode('UTF-8')
        else:
            str_data = data.decode(file_info['encoding'])

    if conf.beautify:
        str_data = jsbeautifier.beautify(str_data)

    with io.StringIO(str_data) as str_obj:
        for block in mince_js(
                str_obj,
                single_line_comments_block=conf.group_single_line_comments):
            if analyze_block(conf, block):
                match = True

    return match
def print_block(conf, block, string_match=False, code_match=False):
    """Write a matched block to stdout.

    ``conf`` and the match flags mirror the call sites in ``analyze_block``;
    they are currently unused but kept for interface stability.
    """
    sys.stdout.write("%s\n" % (block,))
def analyze_block(conf, block):
    """Check one code/comment block against the configured regexps.

    Comment blocks are matched against ``--reg-exp`` and
    ``--reg-exp-comments``; code blocks are matched against ``--reg-exp``
    plus ``--reg-exp-string-literals`` (on string literals) and
    ``--reg-exp-source`` (on the full source). Matching blocks are printed
    via ``print_block``.

    Returns:
        True iff the block matched at least one regexp.
    """
    regexps = [re.compile(r) for r in (conf.reg_exp or [])]
    match = False
    if block.is_comment():
        if conf.reg_exp_comments is not None:
            regexps.extend(re.compile(r) for r in conf.reg_exp_comments)
        match = any(r.search(block.content) for r in regexps)
        if match:
            print_block(conf, block)
    elif block.is_code():
        regexps_string = regexps + [
            re.compile(r) for r in (conf.reg_exp_string_literals or [])
        ]
        regexps_code = regexps + [
            re.compile(r) for r in (conf.reg_exp_source or [])
        ]
        string_match = any(
            r.search(s) for r in regexps_string for s in block.string_literals)
        code_match = any(r.search(block.content) for r in regexps_code)
        match = string_match or code_match
        if match:
            print_block(conf, block, string_match, code_match)
    return match
def analyze_crx(conf, crx):
    """Analyze all JavaScript (and gzipped) members of a crx (zip) file.

    Args:
        conf: parsed command-line configuration.
        crx: path or file-like object of the crx archive.

    Returns:
        True iff any member file matched a configured regexp.
    """
    match = False
    # endswith accepts a tuple — replaces the old chain of `or` clauses.
    suffixes = (".js", ".js.gz", ".jgz", ".jsg", ".css.gz")
    with ZipFile(crx) as crxobj:
        js_files = [
            info for info in crxobj.infolist()
            if info.filename.endswith(suffixes)
        ]
        for jsfile in js_files:
            with crxobj.open(jsfile) as js_file_obj:
                data = js_file_obj.read()
                path = js_file_obj.name
                if jsstrings_data(conf, path, data):
                    match = True

    return match
def analyze_tar(conf, filename):
    """Analyze the most recent crx contained in a tar archive.

    Fixes over the previous revision: the parameter was named ``tarfile``
    (shadowing the ``tarfile`` module it needs for ``tarfile.open``), the
    body referenced undefined names (``filename``, ``date``, ``args``,
    ``path``), called ``analyze_crx`` with a wrong arity, and never
    returned its result.

    Args:
        conf: parsed command-line configuration (uses ``conf.archive_dir``).
        filename: path of the extension's tar archive; the extension id is
            its basename without extension.

    Returns:
        True iff the crx matched a configured regexp (False when no crx
        was found).
    """
    match = False
    extid = os.path.splitext(os.path.basename(filename))[0]
    # NOTE(review): no --date option is defined by the CLI below; honor a
    # `date` attribute if a caller supplies one — TODO confirm intended source.
    date = getattr(conf, 'date', None)
    if date is not None:
        dateobj = dateutil.parser.parse(date)
        # Treat naive timestamps as UTC so comparisons are well-defined.
        if dateobj.tzinfo is None or dateobj.tzinfo.utcoffset(dateobj) is None:
            dateobj = dateobj.replace(tzinfo=datetime.timezone.utc)
        last_crx_file = last_crx(
            os.path.join(conf.archive_dir, "data"), extid, dateobj)
    else:
        last_crx_file = last_crx(os.path.join(conf.archive_dir, "data"), extid)

    if last_crx_file == "" or last_crx_file is None:
        print("No crx in " + extid)
    else:
        print("# Start analyzing " + extid)
        with tarfile.open(filename, 'r') as archive:
            with archive.extractfile(last_crx_file) as crx:
                match = analyze_crx(conf, crx)
        if match:
            print("RegExp found in " + extid)
        else:
            print("RegExp not found in " + extid)
    return match
def analyze_file(conf, filename):
    """Read *filename* as bytes and run the string analysis on its contents."""
    with open(filename, 'rb') as handle:
        contents = handle.read()
    return jsstrings_data(conf, filename, contents)
def compute_tasks(file_or_extids, taskid=1, maxtaskid=1):
    """Compute the list of tasks for this worker.

    Each entry of *file_or_extids* is either a file to analyze directly
    (.crx/.tar/.css/.js/.c/.java), an extension id, or otherwise a text
    file listing one extension id per line. The collected task list is
    split into *maxtaskid* contiguous chunks and chunk *taskid* (1-based)
    is returned.
    """
    extid_re = re.compile('^[a-p]+$')
    # endswith accepts a tuple — replaces the old six-branch elif chain.
    direct_suffixes = ('.crx', '.tar', '.css', '.js', '.c', '.java')
    tasks = []
    for file_or_extid in file_or_extids:
        if file_or_extid.endswith(direct_suffixes):
            tasks.append(file_or_extid)
        elif extid_re.match(file_or_extid):
            tasks.append(file_or_extid)
        else:
            # default: a file containing one extension id per line
            with open(file_or_extid) as fileobj:
                for line in fileobj:
                    line = line.strip()
                    if extid_re.match(line):
                        tasks.append(line)

    # Ceiling division: the old int(len/max) yielded chunksize 0 whenever
    # there were fewer tasks than workers, giving every worker but the
    # last an empty slice and dumping all work on the last one.
    chunksize = -(-len(tasks) // maxtaskid)
    if taskid == maxtaskid:
        tasks = tasks[(taskid - 1) * chunksize:]
    else:
        tasks = tasks[(taskid - 1) * chunksize:taskid * chunksize]
    return tasks
def analyze_task(conf, task):
    """Analyze one file/tar/crx/extid."""
    logging.debug("Analyzing " + task)
    extid_pattern = re.compile('^[a-p]+$')
    # Guard-clause dispatch: crx and tar by suffix, bare extension ids map
    # to their tar archive, everything else is treated as a plain file.
    if task.endswith('.crx'):
        return analyze_crx(conf, task)
    if task.endswith('.tar'):
        return analyze_tar(conf, task)
    if extid_pattern.match(task):
        return analyze_tar(conf, task + '.tar')
    return analyze_file(conf, task)
def main(conf):
    """Main function: JavaScript strings on steroids."""
    root_logger = logging.getLogger()
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(logging.Formatter(const_log_format()))
    root_logger.addHandler(handler)
    root_logger.setLevel(logging.DEBUG if conf.verbose else logging.WARNING)

    print(vars(conf))
    tasks = compute_tasks(conf.FILE_OR_EXTID, conf.taskid, conf.max_taskid)
    # Fan the tasks out over a worker pool; any single match wins.
    with Pool(conf.parallel) as pool:
        results = pool.map(partial(analyze_task, conf), tasks)
    return reduce(operator.or_, results, False)
if __name__ == "__main__":
    main_parser = argparse.ArgumentParser(
        description=
        'A combination of strings and grep for JavaScript and CSS files.')
    main_parser.add_argument(
        '-r',
        '--reg-exp',
        metavar='REGEXP',
        type=str,
        nargs='+',
        help='search for regular expression')
    main_parser.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        default=False,
        help='increase verbosity')

    main_parser.add_argument(
        '-d',
        '--output-decoration',
        metavar='L',
        choices=[0, 1, 2, 3],
        type=int,
        help='show only matching files, crx, tar')
    main_parser.add_argument(
        '-p',
        '--parallel',
        metavar='P',
        type=int,
        help='run P threads in parallel')
    main_parser.add_argument(
        '-f',
        '--file-pattern',
        metavar='pattern',
        type=str,
        help='process only files matching pattern')

    main_parser.add_argument(
        '-a',
        '--archive-dir',
        metavar='archive',
        type=str,
        default=const_basedir(),
        help='archive directory')
    main_parser.add_argument(
        '-C', '--colorize', action='store_true', help='use colors')

    main_parser.add_argument(
        '-n', '--taskid', metavar='n', type=int, default=1, help='task id')
    main_parser.add_argument(
        '-N',
        '--max-taskid',
        metavar='N',
        type=int,
        default=1,
        help='max task id')

    main_parser.add_argument(
        'FILE_OR_EXTID', nargs='+', help="extid/js/css/crx/tar file")

    comment_group = main_parser.add_argument_group('comment blocks')
    comment_group.add_argument(
        '-g',
        '--group-single-line-comments',
        # BUG FIX: without action='store_true' this flag consumed a value
        # and produced a string, while the analyzer expects a boolean.
        action='store_true',
        help='Group consecutive single-line comments into blocks')
    comment_group.add_argument(
        '-c',
        '--reg-exp-comments',
        metavar='REGEXP',
        type=str,
        nargs='+',
        help='search comments for regular expression')
    source_group = main_parser.add_argument_group('source blocks')
    source_group.add_argument(
        '-b',
        '--beautify',
        action='store_true',
        default=False,
        help='beautify source code')
    source_group.add_argument(
        '-s',
        '--reg-exp-source',
        metavar='REGEXP',
        type=str,
        nargs='+',
        help='search source for regular expression')
    strings_group = main_parser.add_argument_group('string literals')
    strings_group.add_argument(
        '-j',
        '--join-string-literals',
        action='store_true',
        help='join string literals (heuristic)')
    strings_group.add_argument(
        '-l',
        '--reg-exp-string-literals',
        metavar='REGEXP',
        type=str,
        nargs='+',
        help='search string literals for regular expression')

    main_conf = main_parser.parse_args()

    main(main_conf)