524 lines
18 KiB
Python
Executable File
524 lines
18 KiB
Python
Executable File
#!/usr/bin/env python3.6
|
|
#
|
|
# Copyright (C) 2017 The University of Sheffield, UK
|
|
#
|
|
# This program is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
#
|
|
"""Tool for extracting crx file from a tar archive."""
|
|
|
|
import datetime
|
|
import argparse
|
|
import io
|
|
import fnmatch
|
|
import os
|
|
import logging
|
|
import re
|
|
import sys
|
|
import operator
|
|
import tarfile
|
|
import zlib
|
|
from functools import partial, reduce
|
|
from colorama import init, Fore
|
|
from multiprocessing import Pool
|
|
from zipfile import ZipFile
|
|
|
|
import dateutil
|
|
import dateutil.parser
|
|
import jsbeautifier
|
|
|
|
from ExtensionCrawler.config import (const_log_format, const_basedir)
|
|
from ExtensionCrawler.archive import last_crx, first_crx, all_crx
|
|
from ExtensionCrawler.config import get_local_archive_dir
|
|
from ExtensionCrawler.js_decomposer import init_file_info
|
|
from ExtensionCrawler.js_mincer import mince_js
|
|
|
|
|
|
def is_file_with_c_style_comments(filename):
|
|
"""Test if filename indicates file with C-style comment."""
|
|
return (filename.endswith(".js") or filename.endswith(".js.gz")
|
|
or filename.endswith(".jgz") or filename.endswith(".jsg")
|
|
or filename.endswith(".css.gz") or filename.endswith(".c")
|
|
or filename.endswith(".cpp") or filename.endswith(".java"))
|
|
|
|
|
|
def jsstrings_data(conf, path, data):
|
|
"""Analyze data in memory."""
|
|
if conf.file_pattern is not None:
|
|
if path is None:
|
|
return False
|
|
elif not fnmatch.fnmatch(path, conf.file_pattern):
|
|
logging.debug("Filename \'" + path + "\' does not match pattern \'"
|
|
+ conf.file_pattern + "\'")
|
|
return False
|
|
|
|
match = False
|
|
logging.debug("Start analyzing " + path)
|
|
file_info = init_file_info(path, data)
|
|
if file_info['size'] == 0:
|
|
return match
|
|
|
|
if not file_info['dec_encoding'] is None:
|
|
try:
|
|
dec = zlib.decompressobj(zlib.MAX_WBITS | 16)
|
|
dec_data = dec.decompress(data, 100 * file_info['size'])
|
|
if file_info['dec_encoding'] is None:
|
|
logging.debug("Encoding is None for " + path + " using utf-8.")
|
|
str_data = dec_data.decode('UTF-8')
|
|
else:
|
|
str_data = dec_data.decode(file_info['dec_encoding'])
|
|
del dec_data
|
|
except Exception:
|
|
return match
|
|
else:
|
|
if file_info['encoding'] is None:
|
|
logging.warning("Encoding is None for " + path + " using utf-8.")
|
|
str_data = data.decode('UTF-8')
|
|
else:
|
|
str_data = data.decode(file_info['encoding'])
|
|
|
|
if conf.beautify:
|
|
str_data = jsbeautifier.beautify(str_data)
|
|
|
|
with io.StringIO(str_data) as str_obj:
|
|
first = True
|
|
for block in mince_js(
|
|
str_obj,
|
|
single_line_comments_block=conf.group_single_line_comments):
|
|
if analyze_block(conf, path, block, first):
|
|
match = True
|
|
first = False
|
|
if match and 0 < conf.output_decoration < 2:
|
|
print(path)
|
|
return match
|
|
|
|
|
|
def print_block(conf,
|
|
path,
|
|
block,
|
|
first=False,
|
|
string_match=False,
|
|
code_match=False):
|
|
if conf.output_decoration > 1:
|
|
line_no = block.start[0]
|
|
prefix = " " * (block.start[1] - 1)
|
|
# TODO: use classifier
|
|
classifier = "X"
|
|
sep = "=" * (len(path) + 17)
|
|
if not first:
|
|
print(sep)
|
|
if block.is_code():
|
|
classifier = "S"
|
|
else:
|
|
classifier = "C"
|
|
|
|
if block.is_code() and string_match:
|
|
if conf.join_string_literals:
|
|
loc = '({0:d} - {1:d})'.format(block.start[0], block.end[0])
|
|
loc = (' ' * (11 - len(loc))) + loc
|
|
line = '{0} {1} [L]: {2}'.format(
|
|
path, loc, block.string_literals[0].rstrip())
|
|
print(line)
|
|
else:
|
|
for (pos, string) in block.string_literals:
|
|
loc = '({0[0]:d}/{0[1]:d})'.format(pos)
|
|
loc = (' ' * (11 - len(loc))) + loc
|
|
line = '{0} {1} [L]: {2}'.format(path, loc, string.rstrip())
|
|
print(line)
|
|
if code_match:
|
|
print("-" * (len(path) + 17))
|
|
if block.is_comment() or (block.is_code() and code_match):
|
|
for line in block.content.splitlines(True):
|
|
if line_no == block.start[0]:
|
|
line = prefix + line
|
|
line = '{0} {1:11d} [{2}]: {3}'.format(path, line_no,
|
|
classifier,
|
|
line.rstrip())
|
|
print(line)
|
|
line_no += 1
|
|
|
|
|
|
def analyze_block(conf, path, block, first=False):
|
|
"""Print code/comment blocks."""
|
|
match = False
|
|
regexps = []
|
|
if conf.reg_exp is not None:
|
|
for regexp in conf.reg_exp:
|
|
if conf.case_insensitive:
|
|
regexps.append(re.compile(r'(' + regexp + ')', re.IGNORECASE))
|
|
else:
|
|
regexps.append(re.compile(r'(' + regexp + ')'))
|
|
if block.is_comment():
|
|
content = block.content
|
|
if conf.reg_exp_comments is not None:
|
|
for regexp in conf.reg_exp_comments:
|
|
if conf.case_insensitive:
|
|
regexps.append(
|
|
re.compile(r'(' + regexp + ')', re.IGNORECASE))
|
|
else:
|
|
regexps.append(re.compile(r'(' + regexp + ')'))
|
|
for regexp in regexps:
|
|
if regexp.search(block.content):
|
|
if conf.colorize:
|
|
content = regexp.sub(Fore.RED + r'\1' + Fore.RESET,
|
|
content)
|
|
match = True
|
|
if match:
|
|
block.content = content
|
|
print_block(conf, path, block, first)
|
|
elif block.is_code():
|
|
content = block.content
|
|
regexps_string = regexps.copy()
|
|
regexps_code = regexps.copy()
|
|
if conf.reg_exp_string_literals is not None:
|
|
for regexp in conf.reg_exp_string_literals:
|
|
if conf.case_insensitive:
|
|
regexps.append(
|
|
re.compile(r'(' + regexp + ')', re.IGNORECASE))
|
|
else:
|
|
regexps.append(re.compile(r'(' + regexp + ')'))
|
|
if conf.reg_exp_source is not None:
|
|
for regexp in conf.reg_exp_source:
|
|
if conf.case_insensitive:
|
|
regexps.append(
|
|
re.compile(r'(' + regexp + ')', re.IGNORECASE))
|
|
else:
|
|
regexps.append(re.compile(r'(' + regexp + ')'))
|
|
string_match = False
|
|
match_idxs = set()
|
|
string_literals = block.string_literals.copy()
|
|
|
|
if conf.join_string_literals:
|
|
joined_string = ""
|
|
for (_, string) in block.string_literals:
|
|
joined_string = joined_string + string
|
|
joined_cstring = joined_string
|
|
for regexp in regexps_string:
|
|
if regexp.search(joined_string):
|
|
if conf.colorize:
|
|
joined_cstring = regexp.sub(
|
|
Fore.BLUE + r'\1' + Fore.RESET, joined_cstring)
|
|
string_match = True
|
|
|
|
block.string_literals = [joined_cstring]
|
|
else:
|
|
for regexp in regexps_string:
|
|
for idx, (pos, string) in enumerate(block.string_literals):
|
|
if regexp.search(string):
|
|
if conf.colorize:
|
|
string_literals[idx] = ((
|
|
string_literals[idx])[0], regexp.sub(
|
|
Fore.BLUE + r'\1' + Fore.RESET,
|
|
(string_literals[idx])[1]))
|
|
match_idxs.add(idx)
|
|
string_match = True
|
|
block.string_literals = []
|
|
for idx, string in enumerate(string_literals):
|
|
if idx in match_idxs:
|
|
block.string_literals.append(string)
|
|
|
|
code_match = False
|
|
for regexp in regexps_code:
|
|
if regexp.search(block.content):
|
|
if conf.colorize:
|
|
content = regexp.sub(Fore.CYAN + r'\1' + Fore.RESET,
|
|
content)
|
|
code_match = True
|
|
match = string_match or code_match
|
|
block.content = content
|
|
if match:
|
|
print_block(conf, path, block, first, string_match, code_match)
|
|
return match
|
|
|
|
|
|
def analyze_crx(conf, crx, path=""):
|
|
"""Analyze crx file."""
|
|
match = False
|
|
with ZipFile(crx) as crxobj:
|
|
js_files = list(
|
|
filter(lambda x: is_file_with_c_style_comments(x.filename),
|
|
crxobj.infolist()))
|
|
for jsfile in js_files:
|
|
with crxobj.open(jsfile) as js_file_obj:
|
|
data = js_file_obj.read()
|
|
file_path = path + "/" + js_file_obj.name
|
|
if jsstrings_data(conf, file_path, data):
|
|
match = True
|
|
|
|
return match
|
|
|
|
|
|
def analyze_tar(conf, tarfilename):
|
|
last_crx_file = ''
|
|
extid = os.path.splitext(os.path.basename(tarfilename))[0]
|
|
from_dateobj = None
|
|
latest_dateobj = None
|
|
if conf.from_date is not None:
|
|
from_dateobj = dateutil.parser.parse(conf.from_date)
|
|
if from_dateobj.tzinfo is None or from_dateobj.tzinfo.utcoffset(
|
|
from_dateobj) is None:
|
|
from_dateobj = from_dateobj.replace(tzinfo=datetime.timezone.utc)
|
|
if conf.latest_date is not None:
|
|
latest_dateobj = dateutil.parser.parse(conf.latest_date)
|
|
if latest_dateobj.tzinfo is None or latest_dateobj.tzinfo.utcoffset(
|
|
latest_dateobj) is None:
|
|
latest_dateobj = latest_dateobj.replace(
|
|
tzinfo=datetime.timezone.utc)
|
|
|
|
match = False
|
|
|
|
if from_dateobj is None:
|
|
last_crx_file, _ = last_crx(
|
|
os.path.join(conf.archive_dir, "data"), extid, latest_dateobj)
|
|
if last_crx_file == "" or last_crx_file is None:
|
|
logging.warning("No crx in " + extid)
|
|
else:
|
|
with tarfile.open(tarfilename, 'r') as archive:
|
|
with archive.extractfile(last_crx_file) as crx:
|
|
match = analyze_crx(conf, crx, last_crx_file)
|
|
else:
|
|
if latest_dateobj is None:
|
|
# only from date is given
|
|
first_crx_file = first_crx(
|
|
os.path.join(conf.archive_dir, "data"), extid, from_dateobj)
|
|
if first_crx_file == "" or first_crx_file is None:
|
|
logging.warning("No crx in " + extid)
|
|
else:
|
|
with tarfile.open(tarfilename, 'r') as archive:
|
|
with archive.extractfile(first_crx_file) as crx:
|
|
match = analyze_crx(conf, crx, last_crx_file)
|
|
else:
|
|
# both dates are given
|
|
all_crx_files = all_crx(
|
|
os.path.join(conf.archive_dir, "data"), extid)
|
|
if not all_crx_files:
|
|
logging.warning("No crx in " + extid)
|
|
else:
|
|
with tarfile.open(tarfilename, 'r') as archive:
|
|
for crx_file in all_crx_files:
|
|
with archive.extractfile(crx_file) as crx:
|
|
match = analyze_crx(conf, crx,
|
|
last_crx_file) or match
|
|
return match
|
|
|
|
|
|
def analyze_file(conf, filename):
|
|
with open(filename, 'rb') as fileobj:
|
|
data = fileobj.read()
|
|
return jsstrings_data(conf, filename, data)
|
|
|
|
|
|
def compute_tasks(file_or_extids, taskid=1, maxtaskid=1):
|
|
"""Function for computing list of tasks."""
|
|
extid_re = re.compile(r'^[a-p]+$')
|
|
tasks = []
|
|
for file_or_extid in file_or_extids:
|
|
if is_file_with_c_style_comments(file_or_extid):
|
|
tasks.append(file_or_extid)
|
|
elif file_or_extid.endswith('.tar'):
|
|
tasks.append(file_or_extid)
|
|
elif file_or_extid.endswith('.crx'):
|
|
tasks.append(file_or_extid)
|
|
elif extid_re.match(file_or_extid):
|
|
tasks.append(file_or_extid)
|
|
else:
|
|
# default: a file with extension ide
|
|
with open(file_or_extid) as fileobj:
|
|
for line in fileobj:
|
|
line = line.strip()
|
|
if extid_re.match(line):
|
|
tasks.append(line)
|
|
|
|
chunksize = int(len(tasks) / maxtaskid)
|
|
if taskid == maxtaskid:
|
|
tasks = tasks[(taskid - 1) * chunksize:]
|
|
else:
|
|
tasks = tasks[(taskid - 1) * chunksize:taskid * chunksize]
|
|
|
|
return tasks
|
|
|
|
|
|
def analyze_task(conf, task):
|
|
"""Analyze one file/tar/crx/extid."""
|
|
logging.debug("Analyzing " + task)
|
|
extid_re = re.compile('^[a-p]+$')
|
|
if task.endswith('.crx'):
|
|
retval = analyze_crx(conf, task)
|
|
elif task.endswith('.tar'):
|
|
retval = analyze_tar(conf, task)
|
|
elif extid_re.match(task):
|
|
tarfilename = "data/" + get_local_archive_dir(task) + "/" + task + '.tar'
|
|
retval = analyze_tar(conf, conf.archive_dir + "/" + tarfilename)
|
|
else:
|
|
retval = analyze_file(conf, task)
|
|
return retval
|
|
|
|
|
|
def main(conf):
|
|
"""Main function: JavaScript strings on steroids."""
|
|
logger = logging.getLogger()
|
|
ch = logging.StreamHandler(sys.stdout)
|
|
ch.setFormatter(logging.Formatter(const_log_format()))
|
|
logger.addHandler(ch)
|
|
if conf.verbose:
|
|
logger.setLevel(logging.DEBUG)
|
|
else:
|
|
logger.setLevel(logging.WARNING)
|
|
|
|
if conf.colorize:
|
|
init()
|
|
|
|
tasks = compute_tasks(conf.FILE_OR_EXTID, conf.taskid, conf.max_taskid)
|
|
with Pool(conf.parallel) as p:
|
|
retvals = p.map(partial(analyze_task, conf), tasks)
|
|
return not reduce(operator.or_, retvals, False)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main_parser = argparse.ArgumentParser(
|
|
formatter_class=argparse.RawTextHelpFormatter,
|
|
description=
|
|
'A combination of strings and grep for JavaScript and CSS files.')
|
|
main_parser.add_argument(
|
|
'-r',
|
|
'--reg-exp',
|
|
metavar='REGEXP',
|
|
type=str,
|
|
nargs='+',
|
|
help='search for regular expression')
|
|
main_parser.add_argument(
|
|
'-v',
|
|
'--verbose',
|
|
action='store_true',
|
|
default=False,
|
|
help='increase verbosity')
|
|
main_parser.add_argument(
|
|
'-i',
|
|
'--case-insensitive',
|
|
action='store_true',
|
|
default=False,
|
|
help='match case insensitive')
|
|
|
|
main_parser.add_argument(
|
|
'-o',
|
|
'--output-decoration',
|
|
metavar='L',
|
|
choices=[0, 1, 2, 3],
|
|
type=int,
|
|
default=2,
|
|
help='select level L of details shown for matches:\n' +
|
|
' 0: exit 0 in case of match found, 1 otherwise\n' +
|
|
' 1: show only matching filename\n' +
|
|
' 2: show matching lines (NOT YET SUPPORTED)\n' +
|
|
' 3: show matching blocks')
|
|
main_parser.add_argument(
|
|
'-p',
|
|
'--parallel',
|
|
metavar='P',
|
|
type=int,
|
|
default=1,
|
|
help='run P threads in parallel')
|
|
|
|
main_parser.add_argument(
|
|
'-D',
|
|
'--latest-date',
|
|
metavar='DATE',
|
|
type=str,
|
|
help='select latest crx from tar, released before DATE.\n' +
|
|
'Together with --latest-date, specifies all crx released in specified\n'
|
|
+ 'date range.')
|
|
|
|
main_parser.add_argument(
|
|
'-d',
|
|
'--from-date',
|
|
metavar='DATE',
|
|
type=str,
|
|
help='select oldest crx from tar released after DATE.\n' +
|
|
'Together with --from-date, specifies all crx released in specified\n'
|
|
+ 'date range.')
|
|
|
|
main_parser.add_argument(
|
|
'-f',
|
|
'--file-pattern',
|
|
metavar='pattern',
|
|
type=str,
|
|
help='process only files matching pattern')
|
|
|
|
main_parser.add_argument(
|
|
'-a',
|
|
'--archive-dir',
|
|
metavar='archive',
|
|
type=str,
|
|
default=const_basedir(),
|
|
help='archive directory')
|
|
main_parser.add_argument(
|
|
'-C', '--colorize', action='store_true', help='use colors')
|
|
|
|
main_parser.add_argument(
|
|
'-n', '--taskid', metavar='n', type=int, default=1, help='task id')
|
|
main_parser.add_argument(
|
|
'-N',
|
|
'--max-taskid',
|
|
metavar='N',
|
|
type=int,
|
|
default=1,
|
|
help='max task id')
|
|
|
|
main_parser.add_argument(
|
|
'FILE_OR_EXTID', nargs='+', help="extid/js/css/crx/tar file")
|
|
|
|
comment_group = main_parser.add_argument_group('comment blocks')
|
|
comment_group.add_argument(
|
|
'-g',
|
|
'--group-single-line-comments',
|
|
help='Group consecutive singe-line comments into blocks')
|
|
comment_group.add_argument(
|
|
'-c',
|
|
'--reg-exp-comments',
|
|
metavar='REGEXP',
|
|
type=str,
|
|
nargs='+',
|
|
help='search comments for regular expression')
|
|
|
|
source_group = main_parser.add_argument_group('source blocks')
|
|
source_group.add_argument(
|
|
'-b',
|
|
'--beautify',
|
|
action='store_true',
|
|
default=False,
|
|
help='beautify source code')
|
|
source_group.add_argument(
|
|
'-s',
|
|
'--reg-exp-source',
|
|
metavar='REGEXP',
|
|
type=str,
|
|
nargs='+',
|
|
help='search source for regular expression')
|
|
|
|
strings_group = main_parser.add_argument_group('string literals')
|
|
strings_group.add_argument(
|
|
'-j',
|
|
'--join-string-literals',
|
|
action='store_true',
|
|
help='join string literals (heuristic)')
|
|
strings_group.add_argument(
|
|
'-l',
|
|
'--reg-exp-string-literals',
|
|
metavar='REGEXP',
|
|
type=str,
|
|
nargs='+',
|
|
help='search string literals for regular expression')
|
|
main_conf = main_parser.parse_args()
|
|
|
|
sys.exit(main(main_conf))
|