diff --git a/ExtensionCrawler/js_decomposer.py b/ExtensionCrawler/js_decomposer.py index 733e7e8..97bff25 100755 --- a/ExtensionCrawler/js_decomposer.py +++ b/ExtensionCrawler/js_decomposer.py @@ -18,10 +18,12 @@ general and Chrome extensions in particular.""" import os +import io import re import json from enum import Enum import hashlib +from ExtensionCrawler.js_mincer import mince_js class DetectionType(Enum): """Enumeration for detection types.""" @@ -67,21 +69,21 @@ def unknown_lib_identifiers(): """List of identifiers for generic library version headers.""" return ([ re.compile( - rb'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\sv?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)', + r'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\sv?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)', re.IGNORECASE ), #MatchType: name version, e.g. mylib v1.2.9b or mylib.anything 1.2.8 re.compile( - rb'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\s(?: version)\:?\s?v?([0-9][0-9.a-z_\\\\-]+)', + r'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\s(?: version)\:?\s?v?([0-9][0-9.a-z_\\\\-]+)', re.IGNORECASE ), #MatchType: name version: ver, e.g. mylib version: v1.2.9, or mylib.js version 1.2.8 re.compile( - rb'\@*(version)\s?[\:|-]?\s?v?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)', + r'\@*(version)\s?[\:|-]?\s?v?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)', re.IGNORECASE ), #MatchType: version x.x.x, e.g. @version: 1.2.5 or version - 1.2.5 etc. re.compile( - rb'(version)[\:|\=]\s?.?([0-9]{1,2}[\.|\-|\_][0-9.a-z_\\\\-]+).?', + r'(version)[\:|\=]\s?.?([0-9]{1,2}[\.|\-|\_][0-9.a-z_\\\\-]+).?', re.IGNORECASE), - re.compile(rb'(.+) v([0-9]{1,2}[\.|\-|\_][0-9]{1,2}[0-9.a-z_\\\\-]*)', + re.compile(r'(.+) v([0-9]{1,2}[\.|\-|\_][0-9]{1,2}[0-9.a-z_\\\\-]*)', re.IGNORECASE) ]) @@ -123,28 +125,6 @@ def analyse_known_filename(zipfile, js_file): libs.append(js_info) return libs -def analyse_known_filecontent(zipfile, js_file): - """Check for known file content (license headers).""" - libs = list() - data = "" - with zipfile.open(js_file) as js_file_obj: - data = js_file_obj.read() - for lib, regex in lib_identifiers().items(): - if 'filecontent' in regex: - #iterate over the filecontent regexes for this to see if it has a match - for file_content in regex['filecontent']: - lib_matched = re.finditer(file_content.encode(), data, - re.IGNORECASE) - for match in lib_matched: - ver = match.group(2).decode() - js_info = init_jsinfo(zipfile, js_file) - js_info['lib'] = lib - js_info['ver'] = ver - js_info['type'] = FileClassification.LIBRARY - js_info['detectMethod'] = DetectionType.FILECONTENT - libs.append(js_info) - return libs - def analyse_generic_filename(zipfile, js_file): """Check for generic file name patterns.""" libs = list() @@ -159,30 +139,57 @@ def analyse_generic_filename(zipfile, js_file): libs.append(js_info) return libs -def analyse_generic_filecontent(zipfile, js_file): - """Check for generic file content (license headers).""" +def analyse_filename(zipfile, js_file): + """Check for file name patterns of libraries (known and generic as fall back)`""" + res = analyse_known_filename(zipfile, js_file) + if not res: + res = analyse_generic_filename(zipfile, js_file) + return res + + +def analyse_comment_known_libs(zipfile, js_file, comment): + """Search for library specific identifiers in comment block.""" libs = list() - data = "" - with zipfile.open(js_file) as js_file_obj: - data = js_file_obj.read() for unkregex in unknown_lib_identifiers(): - unkown_lib_matched = unkregex.finditer(data) + unkown_lib_matched = unkregex.finditer(comment.content) for match in unkown_lib_matched: js_info = init_jsinfo(zipfile, js_file) js_info['lib'] = ((js_file.filename).replace( '.js', '')).replace('.min', '') - js_info['ver'] = match.group(2).decode() + js_info['ver'] = match.group(2) js_info['detectMethod'] = DetectionType.FILENAME_FILECONTENT js_info['type'] = FileClassification.LIKELY_LIBRARY libs.append(js_info) return libs -def analyse_filename(zipfile, js_file): - """Check for file name patterns of libraries (known and generic as fall back)`""" - res = analyse_known_filename(zipfile, js_file) - if not res: - res = analyse_generic_filecontent(zipfile, js_file) - return res +def analyse_comment_generic_libs(zipfile, js_file, comment): + """Search for generic identifiers in comment block.""" + libs = list() + for unkregex in unknown_lib_identifiers(): + unkown_lib_matched = unkregex.finditer(comment.content) + for match in unkown_lib_matched: + js_info = init_jsinfo(zipfile, js_file) + js_info['lib'] = ((js_file.filename).replace( + '.js', '')).replace('.min', '') + js_info['ver'] = match.group(2) + js_info['detectMethod'] = DetectionType.FILENAME_FILECONTENT + js_info['type'] = FileClassification.LIKELY_LIBRARY + libs.append(js_info) + return libs + +def analyse_comment_blocks(zipfile, js_file): + """Search for library identifiers in comment.""" + libs = list() + with zipfile.open(js_file) as js_file_obj: + with io.TextIOWrapper(js_file_obj, 'utf-8') as js_text_file_obj: + for block in mince_js(js_text_file_obj, single_line_comments_block=True): + block_libs = list() + if block.is_comment(): + block_libs = analyse_comment_known_libs(zipfile, js_file, block) + if block_libs is None: + block_libs = analyse_comment_generic_libs(zipfile, js_file, block) + libs += block_libs + return libs def decompose_js(zipfile): """JavaScript decomposition analysis for extensions.""" @@ -196,11 +203,8 @@ def decompose_js(zipfile): js_inventory = [] for js_file in list(filter(lambda x: x.filename.endswith(".js"), zipfile.infolist())): - js_info_file = analyse_filename(zipfile, js_file) - - js_info_file += analyse_generic_filecontent(zipfile, js_file) - js_info_file += analyse_known_filecontent(zipfile, js_file) + js_info_file += analyse_comment_blocks(zipfile, js_file) if not js_info_file: # if no library could be detected, we report the JavaScript file as 'application'.