#!/usr/bin/env python3.5
#
# Copyright (C) 2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
"""Tool for extracting crx file from a tar archive."""
import collections
import datetime
import argparse
import io
import os
import logging
import re
import sys
import tarfile
import zlib
from functools import partial
from multiprocessing import Pool
from zipfile import ZipFile
import dateutil.parser
import jsbeautifier
from ExtensionCrawler.config import (const_log_format, const_basedir,
                                     archive_file, get_local_archive_dir)
from ExtensionCrawler.archive import get_existing_ids, last_crx
from ExtensionCrawler.js_decomposer import init_file_info
from ExtensionCrawler.js_mincer import mince_js
# Script should run with python 3.4 or 3.5
assert sys.version_info >= (3, 4) and sys.version_info < (3, 6)
JsStringsConfig = collections.namedtuple('JsStringsConfig', [
    'comment', 'strings', 'group', 'program', 'beautify', 'basedir', 'regexp',
    'parallel', 'verbose'
])
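# Field summary (inferred from how the configuration is used below):
#   comment  -- analyze comment blocks
#   strings  -- analyze string literals inside code blocks
#   group    -- group consecutive single-line comments into one block
#   program  -- analyze code blocks as a whole
#   beautify -- run jsbeautifier over the source before mincing
#   basedir  -- root directory of the extension archive
#   regexp   -- regular expression to search for (None matches everything)
#   parallel -- number of worker processes for archive-wide runs
#   verbose  -- enable INFO-level logging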
def jsstrings_data(path, data, config):
    """Analyze a single JavaScript/CSS file; return True on a match."""
    match = False
    print("## Analyzing " + path)
    file_info = init_file_info(path, data)
    if file_info['size'] == 0:
        return match
    if file_info['dec_encoding'] is not None:
        # Compressed file: strip the gzip wrapper, then decode.
        try:
            dec = zlib.decompressobj(zlib.MAX_WBITS | 16)
            dec_data = dec.decompress(data, 100 * file_info['size'])
            str_data = dec_data.decode(file_info['dec_encoding'])
            del dec_data
        except Exception:
            return match
    else:
        if file_info['encoding'] is None:
            logging.warning("Encoding is None for " + path +
                            ", using utf-8.")
            str_data = data.decode('UTF-8')
        else:
            str_data = data.decode(file_info['encoding'])
    if config.beautify:
        str_data = jsbeautifier.beautify(str_data)
    with io.StringIO(str_data) as str_obj:
        for block in mince_js(
                str_obj, single_line_comments_block=config.group):
            if analyze_block(True, config.comment, config.program,
                             config.strings, config.regexp, block):
                match = True
    return match
def analyze_block(verbose, comment, program, strings, regexp, block):
    """Analyze one block; print matching blocks/strings when verbose.

    Note that re.match anchors at the start of the content, so a
    pattern has to match from the beginning of the block or string.
    """
    match = False
    rgx = None
    if regexp is not None:
        rgx = re.compile(regexp)
if comment and block.is_comment():
if regexp is None or rgx.match(block.content):
if verbose:
print(block)
match = True
elif block.is_code():
if program:
if regexp is None or rgx.match(block.content):
if verbose:
print(block)
match = True
if strings:
for string in block.string_literals:
if regexp is None or rgx.match(string):
if verbose:
print(string)
match = True
return match
def analyze_crx(config, crx, path):
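    """Search the JavaScript/CSS files inside a crx package.

    A crx file is essentially a ZIP archive with an extra header; since
    zipfile locates the central directory from the end of the file, the
    members can be read without stripping that header.
    """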
match = False
if path is None:
with ZipFile(crx) as crxobj:
            js_files = [
                f for f in crxobj.infolist()
                if f.filename.endswith((".js", ".js.gz", ".jgz", ".jsg",
                                        ".css.gz"))
            ]
for jsfile in js_files:
with crxobj.open(jsfile) as js_file_obj:
data = js_file_obj.read()
path = js_file_obj.name
if jsstrings_data(path, data, config):
match = True
else:
with ZipFile(crx) as crxobj:
with crxobj.open(path) as js_file:
data = js_file.read()
match = jsstrings_data(path, data, config)
return match
def analyze_tar(config, date, path, filename):
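    """Analyze the most recent crx (relative to date, if given) in the
    tar archive of one extension."""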
last_crx_file = ''
match = False
extid = os.path.splitext(os.path.basename(filename))[0]
if date is not None:
dateobj = dateutil.parser.parse(date)
if dateobj.tzinfo is None or dateobj.tzinfo.utcoffset(dateobj) is None:
dateobj = dateobj.replace(tzinfo=datetime.timezone.utc)
last_crx_file = last_crx(
os.path.join(config.basedir, "data"), extid, dateobj)
else:
last_crx_file = last_crx(os.path.join(config.basedir, "data"), extid)
    if not last_crx_file:
        print("No crx in " + extid)
    else:
        print("# Start analyzing " + extid)
        with tarfile.open(filename, 'r') as archive:
            with archive.extractfile(last_crx_file) as crx:
                match = analyze_crx(config, crx, path)
        if match:
            print("RegExp found in " + extid)
        else:
            print("RegExp not found in " + extid)
def process_group(config, taskid, maxtaskid, date, path):
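    """Process chunk number taskid (1-based) of maxtaskid equally sized
    chunks of the extension archive with a pool of worker processes."""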
archive_dir = os.path.join(config.basedir, "data")
ext_ids = get_existing_ids(archive_dir)
    chunksize = len(ext_ids) // maxtaskid
if taskid == maxtaskid:
ext_ids = ext_ids[(taskid - 1) * chunksize:]
else:
ext_ids = ext_ids[(taskid - 1) * chunksize:taskid * chunksize]
ext_ids = list(map(partial(archive_file, archive_dir), ext_ids))
with Pool(config.parallel) as p:
p.map(partial(analyze_tar, config, date, path), ext_ids)
def main(args):
    """Main function: JavaScript strings on steroids."""
    # Minimal wiring of the parsed command-line arguments into the
    # configuration; the block-specific search options (-c, -s, -l)
    # and the output decoration options are not honored yet.
    config = JsStringsConfig(
        comment=True,
        strings=False,
        group=args.group_single_line_comments,
        program=False,
        beautify=args.beautify,
        basedir=args.archive_dir if args.archive_dir else const_basedir(),
        regexp=args.reg_exp[0] if args.reg_exp else None,
        parallel=1,
        verbose=True)
    filename = args.FILE_OR_EXTID[0]
    path = None
    date = None
    taskid = args.taskid if args.taskid is not None else -1
    maxtaskid = args.max_taskid if args.max_taskid is not None else -1
    # Chrome extension ids consist solely of the letters a-p.
    extid_re = re.compile('^[a-p]+$')
    if config.verbose:
        loglevel = logging.INFO
    else:
        loglevel = logging.WARNING
    logger = logging.getLogger()
    ch = logging.StreamHandler(sys.stdout)
    ch.setFormatter(logging.Formatter(const_log_format()))
    logger.addHandler(ch)
    logger.setLevel(loglevel)
if taskid > 0 and maxtaskid > 0:
process_group(config, taskid, maxtaskid, date, path)
else:
if filename.endswith('.crx'):
analyze_crx(config, filename, path)
elif filename.endswith('.tar'):
analyze_tar(config, date, path, filename)
elif extid_re.match(filename):
extid = filename
filename = os.path.join(config.basedir, 'data',
get_local_archive_dir(extid),
extid + ".tar")
analyze_tar(config, date, path, filename)
else:
with open(filename, 'rb') as fileobj:
data = fileobj.read()
jsstrings_data(filename, data, config)
if __name__ == "__main__":
main_parser = argparse.ArgumentParser(
description=
'A combination of strings and grep for JavaScript and CSS files.')
main_parser.add_argument(
'-r',
'--reg-exp',
metavar='REGEXP',
type=str,
nargs='+',
help='search for regular expression')
main_parser.add_argument(
'-d',
'--output-decoration',
metavar='L',
choices=[0, 1, 2, 3],
type=int,
        help='level of output decoration: show only matching files, crx, tar')
main_parser.add_argument(
'-a',
'--archive-dir',
metavar='archive',
type=str,
help='archive directory')
main_parser.add_argument(
'-C', '--colorize', action='store_true', help='use colors')
main_parser.add_argument(
'-n', '--taskid', metavar='n', type=int, help='task id')
main_parser.add_argument(
'-N', '--max-taskid', metavar='N', type=int, help='max task id')
main_parser.add_argument(
'FILE_OR_EXTID', nargs='+', help="extid/js/css/crx/tar file")
comment_group = main_parser.add_argument_group('comment blocks')
    comment_group.add_argument(
        '-g',
        '--group-single-line-comments',
        action='store_true',
        help='group consecutive single-line comments into blocks')
comment_group.add_argument(
'-c',
'--reg-exp-comments',
metavar='REGEXP',
type=str,
nargs='+',
help='search comments for regular expression')
source_group = main_parser.add_argument_group('source blocks')
source_group.add_argument(
'-b', '--beautify', action='store_true', help='beautify source code')
source_group.add_argument(
'-s',
'--reg-exp-source',
metavar='REGEXP',
type=str,
nargs='+',
help='search source for regular expression')
strings_group = main_parser.add_argument_group('string literals')
strings_group.add_argument(
'-j',
'--join-string-literals',
action='store_true',
help='join string literals (heuristic)')
strings_group.add_argument(
'-l',
'--reg-exp-string-literals',
metavar='REGEXP',
type=str,
nargs='+',
help='search string literals for regular expression')
args = main_parser.parse_args()
main(args)