ExtensionCrawler/crx-jsstrings

175 lines
5.5 KiB
Plaintext
Raw Normal View History

2017-09-01 13:12:05 +00:00
#!/usr/bin/env python3.5
#
# Copyright (C) 2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
"""Tool for extracting comments, code and string literals from JavaScript files."""
import getopt
import io
import re
import sys
import zlib
from io import StringIO
from zipfile import ZipFile
import collections
import cchardet as chardet
import jsbeautifier
from ExtensionCrawler.js_decomposer import init_file_info
from ExtensionCrawler.js_mincer import JsBlockType, mince_js
# This script is only supported on Python 3.4 and 3.5. The check is an
# assert, so it is skipped when Python runs with -O.
assert (3, 4) <= sys.version_info < (3, 6)
2017-09-21 20:12:00 +00:00
# Immutable bundle of the command-line switches:
#   comment  - print comment blocks
#   strings  - print the string literals found in code blocks
#   group    - handed to mince_js as single_line_comments_block
#   program  - print code blocks
#   beautify - run jsbeautifier on the source before it is minced
#   regexp   - if not None, only print items this regular expression matches
JsStringsConfig = collections.namedtuple(
    'JsStringsConfig', 'comment strings group program beautify regexp')
def jsstrings_data(path, data, config):
    """Print the selected blocks of a single JavaScript file.

    Args:
        path: Name of the file; forwarded to ``init_file_info``.
        data: Raw content of the file as bytes.
        config: ``JsStringsConfig`` instance choosing what gets printed.

    Returns:
        ``None`` once the blocks have been printed.  If compressed content
        cannot be decompressed or decoded, nothing is printed and a
        one-element list holding the ``init_file_info`` result is returned.
    """
    file_info = init_file_info(path, data)
    if file_info['dec_encoding'] is not None:
        # A non-None 'dec_encoding' is treated as "data is compressed and the
        # decompressed text uses that encoding".
        try:
            # zlib decompression objects are not context managers, so the
            # object is used directly instead of inside a ``with`` statement.
            # MAX_WBITS | 16 makes zlib expect a gzip header and trailer.
            dec = zlib.decompressobj(zlib.MAX_WBITS | 16)
            # The second argument caps the size of the decompressed output.
            dec_data = dec.decompress(data, 100 * file_info['size'])
            str_data = dec_data.decode(file_info['dec_encoding'])
            del dec_data
        except Exception:
            # Deliberately best-effort: broken input is skipped, not fatal.
            return [file_info]
    else:
        str_data = data.decode(file_info['encoding'])

    if config.beautify:
        str_data = jsbeautifier.beautify(str_data)

    with StringIO(str_data) as str_obj:
        for block in mince_js(
                str_obj, single_line_comments_block=config.group):
            print_block(config.comment, config.program, config.strings,
                        config.regexp, block)
def helpmsg():
    """Write the usage summary of the command-line tool to standard output."""
    usage_lines = (
        "crx-jsstrings [OPTION] [crx-file] js-file",
        " -h print this help text",
        " -n no comments",
        " -s strings",
        " -g group single line comments",
        " -c program code",
        " -b beautify JavaScript files before analyzing them",
        " -r regexp select only comments/code/strings where regexp matches",
        " -d date use latest extension that was released not later than date (only for tar archives)",
    )
    for line in usage_lines:
        print(line)
def print_block(comment, program, strings, regexp, block):
    """Print a comment or code block according to the selection switches.

    Comment blocks are printed when ``comment`` is set.  For code blocks the
    block itself is printed when ``program`` is set and each of its string
    literals when ``strings`` is set.  If ``regexp`` is not None, only items
    for which ``re.match`` succeeds (a match at the start of the text) are
    printed.
    """
    pattern = re.compile(regexp) if regexp is not None else None

    if comment and block.is_comment():
        if pattern is None or pattern.match(block.content):
            print(block)
        return

    if not block.is_code():
        return

    if program and (pattern is None or pattern.match(block.content)):
        print(block)

    if strings:
        for literal in block.string_literals:
            if pattern is None or pattern.match(literal):
                print(literal)
def main(argv):
    """Main function: JavaScript strings on steroids.

    Parse the command line in ``argv`` (program name already removed) and
    print the selected comments, code blocks and/or string literals.

    Positional arguments:
        * one argument: a ``.crx`` archive (every member whose name ends in
          ``.js`` is processed), a ``.tar`` archive (not implemented yet, so
          nothing is printed) or a plain JavaScript file;
        * two arguments: an archive followed by the path of one JavaScript
          file inside it.

    Exits with status 2 after printing the help text when an option is
    invalid or the number of positional arguments is neither one nor two;
    ``-h`` prints the help text and exits with status 0.
    """
    config = JsStringsConfig(
        comment=True,
        strings=False,
        group=False,
        program=False,
        beautify=False,
        regexp=None)
    # Only meaningful for tar archives, which are not handled yet.
    date = None

    try:
        # getopt expects long option names without leading dashes; a
        # trailing "=" marks the ones that take a value.  "-v" is accepted
        # but currently has no effect.
        opts, args = getopt.getopt(argv, "hbcd:gsnvr:",
                                   ["regexp=", "date=", "beautify"])
    except getopt.GetoptError:
        helpmsg()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            helpmsg()
            sys.exit()
        elif opt == '-n':
            config = config._replace(comment=False)
        elif opt == '-s':
            config = config._replace(strings=True)
        elif opt == '-g':
            config = config._replace(group=True)
        elif opt == '-c':
            config = config._replace(program=True)
        elif opt in ('-b', "--beautify"):
            config = config._replace(beautify=True)
        elif opt in ('-r', "--regexp"):
            config = config._replace(regexp=arg)
        elif opt in ('-d', "--date"):
            date = arg

    if len(args) == 1:
        filename, path = args[0], None
    elif len(args) == 2:
        filename, path = args
    else:
        # Wrong usage is an error, exactly like an unknown option above.
        helpmsg()
        sys.exit(2)

    if filename.endswith('.crx'):
        with ZipFile(filename) as crxobj:
            if path is not None:
                # A single, explicitly named member of the archive.
                with crxobj.open(path) as js_file:
                    data = js_file.read()
                jsstrings_data(path, data, config)
            else:
                # Every member of the archive whose name ends in ".js".
                for info in crxobj.infolist():
                    if not info.filename.endswith(".js"):
                        continue
                    with crxobj.open(info) as js_file:
                        data = js_file.read()
                    jsstrings_data(info.filename, data, config)
    elif filename.endswith('.tar'):
        # TODO: tar archives (with or without a member path, and the
        # "-d date" selection stored in ``date``) are not implemented yet.
        pass
    else:
        # Anything else is treated as a plain JavaScript file.
        with open(filename, 'rb') as fileobj:
            data = fileobj.read()
        jsstrings_data(filename, data, config)
2017-09-21 20:12:00 +00:00
# Run the tool only when executed as a script, passing main() every
# command-line argument after the program name.
if __name__ == "__main__":
    main(argv=sys.argv[1:])