Integrated js_mincer into decomposition analysis to allow, in the future, checking comments, code, and string literals explicitly.

This commit is contained in:
Achim D. Brucker 2017-08-28 10:40:37 +01:00
parent 9ef27f9ac9
commit f10923af03
1 changed files with 48 additions and 44 deletions

View File

@ -18,10 +18,12 @@
general and Chrome extensions in particular."""
import os
import io
import re
import json
from enum import Enum
import hashlib
from ExtensionCrawler.js_mincer import mince_js
class DetectionType(Enum):
"""Enumeration for detection types."""
@ -67,21 +69,21 @@ def unknown_lib_identifiers():
"""List of identifiers for generic library version headers."""
return ([
re.compile(
rb'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\sv?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
r'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\sv?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
re.IGNORECASE
), #MatchType: name version, e.g. mylib v1.2.9b or mylib.anything 1.2.8
re.compile(
rb'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\s(?: version)\:?\s?v?([0-9][0-9.a-z_\\\\-]+)',
r'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\s(?: version)\:?\s?v?([0-9][0-9.a-z_\\\\-]+)',
re.IGNORECASE
), #MatchType: name version: ver, e.g. mylib version: v1.2.9, or mylib.js version 1.2.8
re.compile(
rb'\@*(version)\s?[\:|-]?\s?v?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
r'\@*(version)\s?[\:|-]?\s?v?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
re.IGNORECASE
), #MatchType: version x.x.x, e.g. @version: 1.2.5 or version - 1.2.5 etc.
re.compile(
rb'(version)[\:|\=]\s?.?([0-9]{1,2}[\.|\-|\_][0-9.a-z_\\\\-]+).?',
r'(version)[\:|\=]\s?.?([0-9]{1,2}[\.|\-|\_][0-9.a-z_\\\\-]+).?',
re.IGNORECASE),
re.compile(rb'(.+) v([0-9]{1,2}[\.|\-|\_][0-9]{1,2}[0-9.a-z_\\\\-]*)',
re.compile(r'(.+) v([0-9]{1,2}[\.|\-|\_][0-9]{1,2}[0-9.a-z_\\\\-]*)',
re.IGNORECASE)
])
@ -123,28 +125,6 @@ def analyse_known_filename(zipfile, js_file):
libs.append(js_info)
return libs
def analyse_known_filecontent(zipfile, js_file):
    """Detect known libraries from the raw content of a JavaScript file.

    The file is read from the archive as bytes, and every ``filecontent``
    pattern registered in ``lib_identifiers()`` is searched in it
    case-insensitively.  Each match yields one info record whose version
    is the decoded second capture group of that match.

    Returns a (possibly empty) list of info records.
    """
    with zipfile.open(js_file) as handle:
        raw = handle.read()

    found = []
    for lib_name, spec in lib_identifiers().items():
        if 'filecontent' not in spec:
            continue
        # The patterns are encoded so they can be matched against raw bytes.
        for pattern in spec['filecontent']:
            for hit in re.finditer(pattern.encode(), raw, re.IGNORECASE):
                version = hit.group(2).decode()
                record = init_jsinfo(zipfile, js_file)
                record['lib'] = lib_name
                record['ver'] = version
                record['type'] = FileClassification.LIBRARY
                record['detectMethod'] = DetectionType.FILECONTENT
                found.append(record)
    return found
def analyse_generic_filename(zipfile, js_file):
"""Check for generic file name patterns."""
libs = list()
@ -159,30 +139,57 @@ def analyse_generic_filename(zipfile, js_file):
libs.append(js_info)
return libs
def analyse_generic_filecontent(zipfile, js_file):
"""Check for generic file content (license headers)."""
def analyse_filename(zipfile, js_file):
    """Identify libraries by file name, trying known names before generic ones."""
    # An empty result from the known-name check triggers the generic fallback.
    return (analyse_known_filename(zipfile, js_file)
            or analyse_generic_filename(zipfile, js_file))
def analyse_comment_known_libs(zipfile, js_file, comment):
"""Search for library specific identifiers in comment block."""
# NOTE(review): despite the name, the loop below iterates
# unknown_lib_identifiers() (the generic version-header patterns) and not
# lib_identifiers(); confirm whether library-specific patterns were intended.
# NOTE(review): this listing looks like a rendered diff without +/- markers,
# so superseded and replacement statements may be interleaved below;
# confirm against the repository before relying on any single line.
libs = list()
# Reads the complete file content from the archive into `data`.
data = ""
with zipfile.open(js_file) as js_file_obj:
data = js_file_obj.read()
for unkregex in unknown_lib_identifiers():
# Two consecutive assignments to the same name: the second one, which
# searches comment.content, is the value the following loop iterates over.
unkown_lib_matched = unkregex.finditer(data)
unkown_lib_matched = unkregex.finditer(comment.content)
for match in unkown_lib_matched:
js_info = init_jsinfo(zipfile, js_file)
# The library name is derived from the file name by removing every
# occurrence of '.js' and then of '.min'.
js_info['lib'] = ((js_file.filename).replace(
'.js', '')).replace('.min', '')
# Two consecutive assignments again: the second one (the undecoded second
# capture group) is the value that remains stored under 'ver'.
js_info['ver'] = match.group(2).decode()
js_info['ver'] = match.group(2)
js_info['detectMethod'] = DetectionType.FILENAME_FILECONTENT
js_info['type'] = FileClassification.LIKELY_LIBRARY
libs.append(js_info)
return libs
def analyse_filename(zipfile, js_file):
    """Check known file-name patterns, else fall back to generic file content.

    Returns the result of ``analyse_known_filename`` when it is non-empty;
    otherwise returns the result of ``analyse_generic_filecontent``.
    """
    known = analyse_known_filename(zipfile, js_file)
    if known:
        return known
    return analyse_generic_filecontent(zipfile, js_file)
def analyse_comment_generic_libs(zipfile, js_file, comment):
    """Look for generic version markers inside a single comment block.

    Every pattern from ``unknown_lib_identifiers()`` is searched in
    ``comment.content``.  Each match yields one info record: the library
    name is the file name with '.js' and '.min' removed, and the version
    is the second capture group of the match.

    Returns a (possibly empty) list of info records.
    """
    hits = (
        hit
        for pattern in unknown_lib_identifiers()
        for hit in pattern.finditer(comment.content)
    )
    results = []
    for hit in hits:
        info = init_jsinfo(zipfile, js_file)
        info['lib'] = js_file.filename.replace('.js', '').replace('.min', '')
        info['ver'] = hit.group(2)
        info['detectMethod'] = DetectionType.FILENAME_FILECONTENT
        info['type'] = FileClassification.LIKELY_LIBRARY
        results.append(info)
    return results
def analyse_comment_blocks(zipfile, js_file):
    """Search the comment blocks of a JavaScript file for library identifiers.

    The file is opened from the archive, decoded as UTF-8 and split into
    blocks by ``mince_js`` (with consecutive single-line comments grouped
    into one block).  Each comment block is first checked with
    ``analyse_comment_known_libs``; only if that finds nothing is
    ``analyse_comment_generic_libs`` tried.  Non-comment blocks are skipped.

    Returns a (possibly empty) list with the info records of all blocks.
    """
    libs = []
    with zipfile.open(js_file) as js_file_obj:
        with io.TextIOWrapper(js_file_obj, 'utf-8') as js_text_file_obj:
            for block in mince_js(js_text_file_obj,
                                  single_line_comments_block=True):
                if not block.is_comment():
                    continue
                block_libs = analyse_comment_known_libs(zipfile, js_file, block)
                # The analyser returns a list, never None, so the fallback
                # has to be triggered by an empty result; an `is None` test
                # would leave the generic analysis unreachable.
                if not block_libs:
                    block_libs = analyse_comment_generic_libs(
                        zipfile, js_file, block)
                libs.extend(block_libs)
    return libs
def decompose_js(zipfile):
"""JavaScript decomposition analysis for extensions."""
@ -196,11 +203,8 @@ def decompose_js(zipfile):
js_inventory = []
for js_file in list(filter(lambda x: x.filename.endswith(".js"), zipfile.infolist())):
js_info_file = analyse_filename(zipfile, js_file)
js_info_file += analyse_generic_filecontent(zipfile, js_file)
js_info_file += analyse_known_filecontent(zipfile, js_file)
js_info_file += analyse_comment_blocks(zipfile, js_file)
if not js_info_file:
# if no library could be detected, we report the JavaScript file as 'application'.