#!/usr/bin/env python3.5
#
# Copyright (C) 2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
"""Print comments, program code and string literals of JavaScript files.

The input is either a stand-alone JavaScript file or a crx (zip) archive;
for an archive, either one named member or every ``*.js`` member is
analyzed.
"""

import getopt
import io
import re
import sys
import zlib
from io import StringIO
from zipfile import ZipFile
import collections

import cchardet as chardet
import jsbeautifier

from ExtensionCrawler.js_decomposer import init_file_info
from ExtensionCrawler.js_mincer import JsBlockType, mince_js

# Script should run with python 3.4 or 3.5
assert sys.version_info >= (3, 4) and sys.version_info < (3, 6)

# Output selection switches collected from the command line:
#   comment  -- print comment blocks
#   strings  -- print string literals found in code blocks
#   group    -- passed to mince_js as single_line_comments_block
#   program  -- print code blocks
#   beautify -- run jsbeautifier over the text before mincing
#   regexp   -- optional pattern that printed items have to match
JsStringsConfig = collections.namedtuple('JsStringsConfig', [
    'comment', 'strings', 'group', 'program', 'beautify', 'regexp'
])


def jsstrings_data(path, data, config):
    """Decode one JavaScript file and print the blocks selected by config.

    path   -- name of the file, handed to init_file_info
    data   -- raw bytes of the file
    config -- JsStringsConfig instance

    Returns a one-element list holding the file info when compressed input
    cannot be decompressed/decoded, and None otherwise.
    """
    file_info = init_file_info(path, data)

    if file_info['dec_encoding'] is not None:
        # A decoded-encoding entry means the payload is gzip compressed.
        try:
            # zlib decompression objects are not context managers, so the
            # object must not be used in a "with" statement.
            dec = zlib.decompressobj(zlib.MAX_WBITS | 16)
            # Cap the output at 100 times the recorded size.
            dec_data = dec.decompress(data, 100 * file_info['size'])
            str_data = dec_data.decode(file_info['dec_encoding'])
            del dec_data
        except Exception:
            # Best effort: skip files that cannot be unpacked or decoded.
            return [file_info]
    else:
        str_data = data.decode(file_info['encoding'])

    if config.beautify:
        str_data = jsbeautifier.beautify(str_data)

    with StringIO(str_data) as str_obj:
        for block in mince_js(
                str_obj, single_line_comments_block=config.group):
            print_block(config.comment, config.program, config.strings,
                        config.regexp, block)


def helpmsg():
    """Print help message."""
    print("crx-jsstrings [OPTION] [crx-file] js-file")
    print(" -h print this help text")
    print(" -n no comments")
    print(" -s strings")
    print(" -g group single line comments")
    print(" -c program code")
    print(" -b beautify JavaScript files before analyzing them")
    print(
        " -r regexp select only comments/code/strings where regexp matches")
    print(
        " -d date use latest extension that was released not later than date (only for tar archives)"
    )


def print_block(comment, program, strings, regexp, block):
    """Print code/comment blocks.

    comment -- print the block if it is a comment
    program -- print the block if it is code
    strings -- print the string literals of a code block
    regexp  -- if not None, only items matching this pattern are printed
               (re.match semantics, i.e. anchored at the start of the text)
    block   -- block object as produced by mince_js
    """
    rgx = re.compile(regexp) if regexp is not None else None

    def selected(text):
        """Tell whether text passes the optional regexp filter."""
        return rgx is None or rgx.match(text)

    if comment and block.is_comment():
        if selected(block.content):
            print(block)
    elif block.is_code():
        if program and selected(block.content):
            print(block)
        if strings:
            for string in block.string_literals:
                if selected(string):
                    print(string)


def main(argv):
    """Main function: JavaScript strings on steroids.

    argv -- command line arguments without the program name

    Exits with status 2 on invalid options or a wrong number of positional
    arguments.
    """
    config = JsStringsConfig(
        comment=True,
        strings=False,
        group=False,
        program=False,
        beautify=False,
        regexp=None)
    filename = None
    path = None
    date = None

    try:
        # Long option names are given without the leading "--"; a trailing
        # "=" marks options that take an argument.  "-v" stays accepted
        # (and ignored) as before.
        opts, args = getopt.getopt(argv, "hbcd:gsnvr:",
                                   ["regexp=", "date=", "beautify"])
    except getopt.GetoptError:
        helpmsg()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            helpmsg()
            sys.exit()
        elif opt == '-n':
            config = config._replace(comment=False)
        elif opt == '-s':
            config = config._replace(strings=True)
        elif opt == '-g':
            config = config._replace(group=True)
        elif opt == '-c':
            config = config._replace(program=True)
        elif opt in ('-b', "--beautify"):
            config = config._replace(beautify=True)
        elif opt in ('-r', "--regexp"):
            config = config._replace(regexp=arg)
        elif opt in ('-d', "--date"):
            # NOTE(review): the date is only meant for tar archives, whose
            # handling below is not implemented, so it is currently unused.
            date = arg

    if len(args) == 1:
        filename = args[0]
    elif len(args) == 2:
        filename = args[0]
        path = args[1]
    else:
        helpmsg()
        sys.exit(2)

    if filename.endswith('.crx') and path is not None:
        # Analyze a single named member of the crx archive.
        with ZipFile(filename) as crxobj:
            with crxobj.open(path) as js_file:
                data = js_file.read()
        jsstrings_data(path, data, config)
    elif filename.endswith('.crx') and path is None:
        # Analyze every JavaScript member of the crx archive.
        with ZipFile(filename) as crxobj:
            js_files = [
                info for info in crxobj.infolist()
                if info.filename.endswith(".js")
            ]
            for jsfile in js_files:
                with crxobj.open(jsfile) as js_file_obj:
                    data = js_file_obj.read()
                    member_path = js_file_obj.name
                jsstrings_data(member_path, data, config)
    elif filename.endswith('.tar') and path is not None:
        # Tar archives are recognized but not processed.
        pass
    elif filename.endswith('.tar') and path is None:
        # Tar archives are recognized but not processed.
        pass
    else:
        # Anything else is treated as a plain JavaScript file.
        with open(filename, 'rb') as fileobj:
            data = fileobj.read()
        jsstrings_data(filename, data, config)


if __name__ == "__main__":
    main(sys.argv[1:])