Integrated js_mincer into the decomposition analysis so that, in the future, comments, code, and string literals can be checked explicitly.

This commit is contained in:
Achim D. Brucker 2017-08-28 10:40:37 +01:00
parent 9ef27f9ac9
commit f10923af03
1 changed files with 48 additions and 44 deletions

View File

@ -18,10 +18,12 @@
general and Chrome extensions in particular.""" general and Chrome extensions in particular."""
import os import os
import io
import re import re
import json import json
from enum import Enum from enum import Enum
import hashlib import hashlib
from ExtensionCrawler.js_mincer import mince_js
class DetectionType(Enum): class DetectionType(Enum):
"""Enumeration for detection types.""" """Enumeration for detection types."""
@ -67,21 +69,21 @@ def unknown_lib_identifiers():
"""List of identifiers for generic library version headers.""" """List of identifiers for generic library version headers."""
return ([ return ([
re.compile( re.compile(
rb'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\sv?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)', r'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\sv?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
re.IGNORECASE re.IGNORECASE
), #MatchType: name version, e.g. mylib v1.2.9b or mylib.anything 1.2.8 ), #MatchType: name version, e.g. mylib v1.2.9b or mylib.anything 1.2.8
re.compile( re.compile(
rb'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\s(?: version)\:?\s?v?([0-9][0-9.a-z_\\\\-]+)', r'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\s(?: version)\:?\s?v?([0-9][0-9.a-z_\\\\-]+)',
re.IGNORECASE re.IGNORECASE
), #MatchType: name version: ver, e.g. mylib version: v1.2.9, or mylib.js version 1.2.8 ), #MatchType: name version: ver, e.g. mylib version: v1.2.9, or mylib.js version 1.2.8
re.compile( re.compile(
rb'\@*(version)\s?[\:|-]?\s?v?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)', r'\@*(version)\s?[\:|-]?\s?v?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
re.IGNORECASE re.IGNORECASE
), #MatchType: version x.x.x, e.g. @version: 1.2.5 or version - 1.2.5 etc. ), #MatchType: version x.x.x, e.g. @version: 1.2.5 or version - 1.2.5 etc.
re.compile( re.compile(
rb'(version)[\:|\=]\s?.?([0-9]{1,2}[\.|\-|\_][0-9.a-z_\\\\-]+).?', r'(version)[\:|\=]\s?.?([0-9]{1,2}[\.|\-|\_][0-9.a-z_\\\\-]+).?',
re.IGNORECASE), re.IGNORECASE),
re.compile(rb'(.+) v([0-9]{1,2}[\.|\-|\_][0-9]{1,2}[0-9.a-z_\\\\-]*)', re.compile(r'(.+) v([0-9]{1,2}[\.|\-|\_][0-9]{1,2}[0-9.a-z_\\\\-]*)',
re.IGNORECASE) re.IGNORECASE)
]) ])
@ -123,28 +125,6 @@ def analyse_known_filename(zipfile, js_file):
libs.append(js_info) libs.append(js_info)
return libs return libs
def analyse_known_filecontent(zipfile, js_file):
"""Check for known file content (license headers)."""
libs = list()
data = ""
with zipfile.open(js_file) as js_file_obj:
data = js_file_obj.read()
for lib, regex in lib_identifiers().items():
if 'filecontent' in regex:
#iterate over the filecontent regexes for this to see if it has a match
for file_content in regex['filecontent']:
lib_matched = re.finditer(file_content.encode(), data,
re.IGNORECASE)
for match in lib_matched:
ver = match.group(2).decode()
js_info = init_jsinfo(zipfile, js_file)
js_info['lib'] = lib
js_info['ver'] = ver
js_info['type'] = FileClassification.LIBRARY
js_info['detectMethod'] = DetectionType.FILECONTENT
libs.append(js_info)
return libs
def analyse_generic_filename(zipfile, js_file): def analyse_generic_filename(zipfile, js_file):
"""Check for generic file name patterns.""" """Check for generic file name patterns."""
libs = list() libs = list()
@ -159,30 +139,57 @@ def analyse_generic_filename(zipfile, js_file):
libs.append(js_info) libs.append(js_info)
return libs return libs
def analyse_generic_filecontent(zipfile, js_file): def analyse_filename(zipfile, js_file):
"""Check for generic file content (license headers).""" """Check for file name patterns of libraries (known and generic as fall back)`"""
res = analyse_known_filename(zipfile, js_file)
if not res:
res = analyse_generic_filename(zipfile, js_file)
return res
def analyse_comment_known_libs(zipfile, js_file, comment):
"""Search for library specific identifiers in comment block."""
libs = list() libs = list()
data = ""
with zipfile.open(js_file) as js_file_obj:
data = js_file_obj.read()
for unkregex in unknown_lib_identifiers(): for unkregex in unknown_lib_identifiers():
unkown_lib_matched = unkregex.finditer(data) unkown_lib_matched = unkregex.finditer(comment.content)
for match in unkown_lib_matched: for match in unkown_lib_matched:
js_info = init_jsinfo(zipfile, js_file) js_info = init_jsinfo(zipfile, js_file)
js_info['lib'] = ((js_file.filename).replace( js_info['lib'] = ((js_file.filename).replace(
'.js', '')).replace('.min', '') '.js', '')).replace('.min', '')
js_info['ver'] = match.group(2).decode() js_info['ver'] = match.group(2)
js_info['detectMethod'] = DetectionType.FILENAME_FILECONTENT js_info['detectMethod'] = DetectionType.FILENAME_FILECONTENT
js_info['type'] = FileClassification.LIKELY_LIBRARY js_info['type'] = FileClassification.LIKELY_LIBRARY
libs.append(js_info) libs.append(js_info)
return libs return libs
def analyse_filename(zipfile, js_file): def analyse_comment_generic_libs(zipfile, js_file, comment):
"""Check for file name patterns of libraries (known and generic as fall back)`""" """Search for generic identifiers in comment block."""
res = analyse_known_filename(zipfile, js_file) libs = list()
if not res: for unkregex in unknown_lib_identifiers():
res = analyse_generic_filecontent(zipfile, js_file) unkown_lib_matched = unkregex.finditer(comment.content)
return res for match in unkown_lib_matched:
js_info = init_jsinfo(zipfile, js_file)
js_info['lib'] = ((js_file.filename).replace(
'.js', '')).replace('.min', '')
js_info['ver'] = match.group(2)
js_info['detectMethod'] = DetectionType.FILENAME_FILECONTENT
js_info['type'] = FileClassification.LIKELY_LIBRARY
libs.append(js_info)
return libs
def analyse_comment_blocks(zipfile, js_file):
"""Search for library identifiers in comment."""
libs = list()
with zipfile.open(js_file) as js_file_obj:
with io.TextIOWrapper(js_file_obj, 'utf-8') as js_text_file_obj:
for block in mince_js(js_text_file_obj, single_line_comments_block=True):
block_libs = list()
if block.is_comment():
block_libs = analyse_comment_known_libs(zipfile, js_file, block)
if block_libs is None:
block_libs = analyse_comment_generic_libs(zipfile, js_file, block)
libs += block_libs
return libs
def decompose_js(zipfile): def decompose_js(zipfile):
"""JavaScript decomposition analysis for extensions.""" """JavaScript decomposition analysis for extensions."""
@ -196,11 +203,8 @@ def decompose_js(zipfile):
js_inventory = [] js_inventory = []
for js_file in list(filter(lambda x: x.filename.endswith(".js"), zipfile.infolist())): for js_file in list(filter(lambda x: x.filename.endswith(".js"), zipfile.infolist())):
js_info_file = analyse_filename(zipfile, js_file) js_info_file = analyse_filename(zipfile, js_file)
js_info_file += analyse_comment_blocks(zipfile, js_file)
js_info_file += analyse_generic_filecontent(zipfile, js_file)
js_info_file += analyse_known_filecontent(zipfile, js_file)
if not js_info_file: if not js_info_file:
# if no library could be detected, we report the JavaScript file as 'application'. # if no library could be detected, we report the JavaScript file as 'application'.