Integrated js_mincer into decomposition analysis to allow, in the future, checking comments, code, and string literals explicitly.
This commit is contained in:
parent
9ef27f9ac9
commit
f10923af03
|
@ -18,10 +18,12 @@
|
||||||
general and Chrome extensions in particular."""
|
general and Chrome extensions in particular."""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import io
|
||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
import hashlib
|
import hashlib
|
||||||
|
from ExtensionCrawler.js_mincer import mince_js
|
||||||
|
|
||||||
class DetectionType(Enum):
|
class DetectionType(Enum):
|
||||||
"""Enumeration for detection types."""
|
"""Enumeration for detection types."""
|
||||||
|
@ -67,21 +69,21 @@ def unknown_lib_identifiers():
|
||||||
"""List of identifiers for generic library version headers."""
|
"""List of identifiers for generic library version headers."""
|
||||||
return ([
|
return ([
|
||||||
re.compile(
|
re.compile(
|
||||||
rb'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\sv?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
|
r'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\sv?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
|
||||||
re.IGNORECASE
|
re.IGNORECASE
|
||||||
), #MatchType: name version, e.g. mylib v1.2.9b or mylib.anything 1.2.8
|
), #MatchType: name version, e.g. mylib v1.2.9b or mylib.anything 1.2.8
|
||||||
re.compile(
|
re.compile(
|
||||||
rb'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\s(?: version)\:?\s?v?([0-9][0-9.a-z_\\\\-]+)',
|
r'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\s(?: version)\:?\s?v?([0-9][0-9.a-z_\\\\-]+)',
|
||||||
re.IGNORECASE
|
re.IGNORECASE
|
||||||
), #MatchType: name version: ver, e.g. mylib version: v1.2.9, or mylib.js version 1.2.8
|
), #MatchType: name version: ver, e.g. mylib version: v1.2.9, or mylib.js version 1.2.8
|
||||||
re.compile(
|
re.compile(
|
||||||
rb'\@*(version)\s?[\:|-]?\s?v?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
|
r'\@*(version)\s?[\:|-]?\s?v?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
|
||||||
re.IGNORECASE
|
re.IGNORECASE
|
||||||
), #MatchType: version x.x.x, e.g. @version: 1.2.5 or version - 1.2.5 etc.
|
), #MatchType: version x.x.x, e.g. @version: 1.2.5 or version - 1.2.5 etc.
|
||||||
re.compile(
|
re.compile(
|
||||||
rb'(version)[\:|\=]\s?.?([0-9]{1,2}[\.|\-|\_][0-9.a-z_\\\\-]+).?',
|
r'(version)[\:|\=]\s?.?([0-9]{1,2}[\.|\-|\_][0-9.a-z_\\\\-]+).?',
|
||||||
re.IGNORECASE),
|
re.IGNORECASE),
|
||||||
re.compile(rb'(.+) v([0-9]{1,2}[\.|\-|\_][0-9]{1,2}[0-9.a-z_\\\\-]*)',
|
re.compile(r'(.+) v([0-9]{1,2}[\.|\-|\_][0-9]{1,2}[0-9.a-z_\\\\-]*)',
|
||||||
re.IGNORECASE)
|
re.IGNORECASE)
|
||||||
])
|
])
|
||||||
|
|
||||||
|
@ -123,28 +125,6 @@ def analyse_known_filename(zipfile, js_file):
|
||||||
libs.append(js_info)
|
libs.append(js_info)
|
||||||
return libs
|
return libs
|
||||||
|
|
||||||
def analyse_known_filecontent(zipfile, js_file):
|
|
||||||
"""Check for known file content (license headers)."""
|
|
||||||
libs = list()
|
|
||||||
data = ""
|
|
||||||
with zipfile.open(js_file) as js_file_obj:
|
|
||||||
data = js_file_obj.read()
|
|
||||||
for lib, regex in lib_identifiers().items():
|
|
||||||
if 'filecontent' in regex:
|
|
||||||
#iterate over the filecontent regexes for this to see if it has a match
|
|
||||||
for file_content in regex['filecontent']:
|
|
||||||
lib_matched = re.finditer(file_content.encode(), data,
|
|
||||||
re.IGNORECASE)
|
|
||||||
for match in lib_matched:
|
|
||||||
ver = match.group(2).decode()
|
|
||||||
js_info = init_jsinfo(zipfile, js_file)
|
|
||||||
js_info['lib'] = lib
|
|
||||||
js_info['ver'] = ver
|
|
||||||
js_info['type'] = FileClassification.LIBRARY
|
|
||||||
js_info['detectMethod'] = DetectionType.FILECONTENT
|
|
||||||
libs.append(js_info)
|
|
||||||
return libs
|
|
||||||
|
|
||||||
def analyse_generic_filename(zipfile, js_file):
|
def analyse_generic_filename(zipfile, js_file):
|
||||||
"""Check for generic file name patterns."""
|
"""Check for generic file name patterns."""
|
||||||
libs = list()
|
libs = list()
|
||||||
|
@ -159,30 +139,57 @@ def analyse_generic_filename(zipfile, js_file):
|
||||||
libs.append(js_info)
|
libs.append(js_info)
|
||||||
return libs
|
return libs
|
||||||
|
|
||||||
def analyse_generic_filecontent(zipfile, js_file):
|
def analyse_filename(zipfile, js_file):
|
||||||
"""Check for generic file content (license headers)."""
|
"""Check for file name patterns of libraries (known and generic as fall back)`"""
|
||||||
|
res = analyse_known_filename(zipfile, js_file)
|
||||||
|
if not res:
|
||||||
|
res = analyse_generic_filename(zipfile, js_file)
|
||||||
|
return res
|
||||||
|
|
||||||
|
|
||||||
|
def analyse_comment_known_libs(zipfile, js_file, comment):
|
||||||
|
"""Search for library specific identifiers in comment block."""
|
||||||
libs = list()
|
libs = list()
|
||||||
data = ""
|
|
||||||
with zipfile.open(js_file) as js_file_obj:
|
|
||||||
data = js_file_obj.read()
|
|
||||||
for unkregex in unknown_lib_identifiers():
|
for unkregex in unknown_lib_identifiers():
|
||||||
unkown_lib_matched = unkregex.finditer(data)
|
unkown_lib_matched = unkregex.finditer(comment.content)
|
||||||
for match in unkown_lib_matched:
|
for match in unkown_lib_matched:
|
||||||
js_info = init_jsinfo(zipfile, js_file)
|
js_info = init_jsinfo(zipfile, js_file)
|
||||||
js_info['lib'] = ((js_file.filename).replace(
|
js_info['lib'] = ((js_file.filename).replace(
|
||||||
'.js', '')).replace('.min', '')
|
'.js', '')).replace('.min', '')
|
||||||
js_info['ver'] = match.group(2).decode()
|
js_info['ver'] = match.group(2)
|
||||||
js_info['detectMethod'] = DetectionType.FILENAME_FILECONTENT
|
js_info['detectMethod'] = DetectionType.FILENAME_FILECONTENT
|
||||||
js_info['type'] = FileClassification.LIKELY_LIBRARY
|
js_info['type'] = FileClassification.LIKELY_LIBRARY
|
||||||
libs.append(js_info)
|
libs.append(js_info)
|
||||||
return libs
|
return libs
|
||||||
|
|
||||||
def analyse_filename(zipfile, js_file):
|
def analyse_comment_generic_libs(zipfile, js_file, comment):
|
||||||
"""Check for file name patterns of libraries (known and generic as fall back)`"""
|
"""Search for generic identifiers in comment block."""
|
||||||
res = analyse_known_filename(zipfile, js_file)
|
libs = list()
|
||||||
if not res:
|
for unkregex in unknown_lib_identifiers():
|
||||||
res = analyse_generic_filecontent(zipfile, js_file)
|
unkown_lib_matched = unkregex.finditer(comment.content)
|
||||||
return res
|
for match in unkown_lib_matched:
|
||||||
|
js_info = init_jsinfo(zipfile, js_file)
|
||||||
|
js_info['lib'] = ((js_file.filename).replace(
|
||||||
|
'.js', '')).replace('.min', '')
|
||||||
|
js_info['ver'] = match.group(2)
|
||||||
|
js_info['detectMethod'] = DetectionType.FILENAME_FILECONTENT
|
||||||
|
js_info['type'] = FileClassification.LIKELY_LIBRARY
|
||||||
|
libs.append(js_info)
|
||||||
|
return libs
|
||||||
|
|
||||||
|
def analyse_comment_blocks(zipfile, js_file):
|
||||||
|
"""Search for library identifiers in comment."""
|
||||||
|
libs = list()
|
||||||
|
with zipfile.open(js_file) as js_file_obj:
|
||||||
|
with io.TextIOWrapper(js_file_obj, 'utf-8') as js_text_file_obj:
|
||||||
|
for block in mince_js(js_text_file_obj, single_line_comments_block=True):
|
||||||
|
block_libs = list()
|
||||||
|
if block.is_comment():
|
||||||
|
block_libs = analyse_comment_known_libs(zipfile, js_file, block)
|
||||||
|
if block_libs is None:
|
||||||
|
block_libs = analyse_comment_generic_libs(zipfile, js_file, block)
|
||||||
|
libs += block_libs
|
||||||
|
return libs
|
||||||
|
|
||||||
def decompose_js(zipfile):
|
def decompose_js(zipfile):
|
||||||
"""JavaScript decomposition analysis for extensions."""
|
"""JavaScript decomposition analysis for extensions."""
|
||||||
|
@ -196,11 +203,8 @@ def decompose_js(zipfile):
|
||||||
|
|
||||||
js_inventory = []
|
js_inventory = []
|
||||||
for js_file in list(filter(lambda x: x.filename.endswith(".js"), zipfile.infolist())):
|
for js_file in list(filter(lambda x: x.filename.endswith(".js"), zipfile.infolist())):
|
||||||
|
|
||||||
js_info_file = analyse_filename(zipfile, js_file)
|
js_info_file = analyse_filename(zipfile, js_file)
|
||||||
|
js_info_file += analyse_comment_blocks(zipfile, js_file)
|
||||||
js_info_file += analyse_generic_filecontent(zipfile, js_file)
|
|
||||||
js_info_file += analyse_known_filecontent(zipfile, js_file)
|
|
||||||
|
|
||||||
if not js_info_file:
|
if not js_info_file:
|
||||||
# if no library could be detected, we report the JavaScript file as 'application'.
|
# if no library could be detected, we report the JavaScript file as 'application'.
|
||||||
|
|
Loading…
Reference in New Issue