ExtensionCrawler/ExtensionCrawler/js_decomposer.py

372 lines
14 KiB
Python
Raw Normal View History

2017-09-01 13:12:05 +00:00
#!/usr/bin/env python3.5
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Python module providing a decomposition analysis of JavaScript code in
general and Chrome extensions in particular."""
import os
import io
import re
import json
from enum import Enum
from ExtensionCrawler.js_mincer import mince_js
from ExtensionCrawler.file_identifiers import get_file_identifiers
class DetectionType(Enum):
    """Kinds of evidence on which a file classification can be based.

    The members are grouped below under the classification label each
    group was originally filed under; the member values are the
    human-readable descriptions of the evidence.
    """

    # --- EMPTY_FILE ---------------------------------------------------
    FILE_SIZE = "file_size"

    # --- LIBRARY ------------------------------------------------------
    SHA1 = "sha1"
    MD5 = "md5"
    SHA1_DECOMPRESSED = "sha1 (after decompression)"
    MD5_DECOMPRESSED = "md5 (after decompression)"
    SHA1_NORMALIZED = "sha1 (after normalization)"
    MD5_NORMALIZED = "md5 (after normalization)"
    SHA1_DECOMPRESSED_NORMALIZED = "sha1 (after decompression and normalization)"
    MD5_DECOMPRESSED_NORMALIZED = "md5 (after decompression and normalization)"

    # --- VERY_LIKELY_LIBRARY ------------------------------------------
    FILENAME_COMMENTBLOCK = "filename and witness in comment block"
    FILENAME_CODEBLOCK = "filename and witness in code block"

    # --- LIKELY_LIBRARY -----------------------------------------------
    COMMENTBLOCK = "witness in comment block"
    CODEBLOCK = "witness in code block"
    FILENAME = "known file name"
    URL = "known URL"

    # --- LIKELY_APPLICATION -------------------------------------------
    DEFAULT = "default"
class FileClassification(Enum):
    """Categories a decomposed file can be assigned to.

    Each member's value is the human-readable label of the category.
    """

    # Files that carry no application or library code.
    EMPTY_FILE = "other (empty file)"
    METADATA = "metadata"

    # Library verdicts, listed from most to least specific wording.
    LIBRARY = "known library"
    VERY_LIKELY_LIBRARY = "very likely known library"
    LIKELY_LIBRARY = "likely known library"

    # Remaining verdicts.
    LIKELY_APPLICATION = "likely application"
    ERROR = "error"
2017-08-28 18:20:50 +00:00
def load_lib_identifiers():
    """Load the identifiers for known libraries from the bundled JSON file.

    The file is looked up as ``../resources/js_identifier.json`` relative
    to the directory that contains this module (symlinks resolved).

    Returns:
        The decoded JSON document, exactly as produced by ``json.load``.

    Raises:
        OSError: If the resource file cannot be opened.
        ValueError: If the file does not contain valid JSON
            (``json.JSONDecodeError`` is a subclass of ``ValueError``).
    """
    module_dir = os.path.dirname(os.path.realpath(__file__))
    regex_file = os.path.join(module_dir, '../resources/',
                              'js_identifier.json')
    # Decode explicitly as UTF-8 so the result does not depend on the
    # locale's preferred encoding of the machine running the analysis.
    with open(regex_file, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)
2017-08-23 19:04:52 +00:00
2017-08-23 23:44:34 +00:00
2017-08-23 22:37:15 +00:00
def unknown_filename_identifier():
    """Return the regex that pulls name and version out of a generic file name.

    Group 1 captures everything in front of the last ``-``/``_`` that is
    followed by a version-like suffix; group 2 captures that version.
    Matching is case-insensitive.

    NOTE: inside the ``[...]`` classes the ``|`` is a literal character,
    so ``|`` is accepted as a version separator alongside ``.``, ``-``
    and ``_``.
    """
    name_and_version = (
        r'(.+)[\-\_]([0-9]{1,2}[\.|\-|\_][0-9a-z]{1,2}[\.|\-|\_][0-9a-z\-\_]*)'
    )
    return re.compile(name_and_version, flags=re.IGNORECASE)
2017-08-23 22:37:15 +00:00
def unknown_lib_identifiers():
    """Return the regexes that recognise generic library version headers.

    Every pattern is compiled case-insensitively and exposes the version
    string as group 2. Group 1 is either the library name or the literal
    word ``version``, depending on the pattern.

    NOTE: inside the ``[...]`` classes the ``|`` is a literal character,
    not an alternation, so ``|`` is accepted wherever those classes occur.

    Returns:
        A list of five compiled patterns, in order of application.
    """
    header_patterns = (
        # name version, e.g. "mylib v1.2.9b" or "mylib.anything 1.2.8"
        r'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\sv?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
        # name "version" ver, e.g. "mylib version: v1.2.9" or
        # "mylib.js version 1.2.8".  The previous form `\s(?: version)`
        # demanded two whitespace characters before "version" and so
        # rejected these very examples; `\s+` accepts one or more and
        # therefore still matches everything the old pattern matched.
        r'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\s+(?:version)\:?\s?v?([0-9][0-9.a-z_\\\\-]+)',
        # "version" x.x.x, e.g. "@version: 1.2.5" or "version - 1.2.5"
        r'\@*(version)\s?[\:|-]?\s?v?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
        # "version" directly followed by ':' or '=' and the version
        r'(version)[\:|\=]\s?.?([0-9]{1,2}[\.|\-|\_][0-9.a-z_\\\\-]+).?',
        # arbitrary text followed by " v" and the version
        r'(.+) v([0-9]{1,2}[\.|\-|\_][0-9]{1,2}[0-9.a-z_\\\\-]*)',
    )
    return [re.compile(pattern, re.IGNORECASE) for pattern in header_patterns]
def init_file_info(path, data):
    """Create the per-file record with all analysis fields unset.

    The record is whatever ``get_file_identifiers(path, data)`` returns,
    extended with the analysis result keys, each initialised to ``None``.

    Args:
        path: Path of the file (forwarded to ``get_file_identifiers``).
        data: Content of the file (forwarded to ``get_file_identifiers``).

    Returns:
        The initialised record.
    """
    unset_fields = (
        'lib',
        'version',
        'detectionMethod',
        'detectionMethodDetails',
        'type',
        'evidenceStartPos',
        'evidenceEndPos',
        'evidenceText',
    )
    record = get_file_identifiers(path, data)
    for field in unset_fields:
        record[field] = None
    return record
def check_empty_file(file_info):
    """Classify the file as empty when its recorded size is zero.

    Args:
        file_info: File record; ``file_info['size']`` is inspected and,
            for a size of 0, ``detectionMethod`` and ``type`` are set.

    Returns:
        The same ``file_info`` object (modified in place when empty).
    """
    if file_info['size'] != 0:
        # Non-empty file: leave the record untouched.
        return file_info

    file_info['detectionMethod'] = DetectionType.FILE_SIZE
    file_info['type'] = FileClassification.EMPTY_FILE
    return file_info
def check_metadata(file_info):
    """Classify the file as metadata when its path is a known metadata path.

    Args:
        file_info: File record; ``file_info['path']`` is compared against
            the known metadata paths and, on a hit, ``detectionMethod``
            and ``type`` are set.

    Returns:
        The same ``file_info`` object (modified in place on a hit).
    """
    metadata_paths = ("manifest.json", "_metadata/verified_contents.json")
    if file_info['path'] in metadata_paths:
        file_info['detectionMethod'] = DetectionType.FILENAME
        file_info['type'] = FileClassification.METADATA
    return file_info
def check_sha1(file_info):
    """Check the file content against known sha1 hashes.

    Placeholder: the lookup is not implemented yet, so the record is
    returned unchanged.

    Args:
        file_info: File record to inspect.

    Returns:
        The same ``file_info`` object.
    """
    # TODO: implement the sha1 lookup.
    return file_info
def check_sha1_decompressed(file_info):
    """Check the decompressed file content against known sha1 hashes.

    Placeholder: the lookup is not implemented yet, so the record is
    returned unchanged.

    Args:
        file_info: File record to inspect.

    Returns:
        The same ``file_info`` object.
    """
    # TODO: implement the sha1 lookup on decompressed content.
    return file_info
def check_sha1_normalized(file_info):
    """Check the normalized file content against known sha1 hashes.

    Placeholder: the lookup is not implemented yet, so the record is
    returned unchanged.

    Args:
        file_info: File record to inspect.

    Returns:
        The same ``file_info`` object.
    """
    # TODO: implement the sha1 lookup on normalized content.
    return file_info
def check_sha1_decompressed_normalized(file_info):
    """Check the decompressed, normalized content against known sha1 hashes.

    Placeholder: the lookup is not implemented yet, so the record is
    returned unchanged.

    Args:
        file_info: File record to inspect.

    Returns:
        The same ``file_info`` object.
    """
    # TODO: implement the sha1 lookup on decompressed, normalized content.
    return file_info
def check_filename(file_info):
    """Check for known file names and typical library file name patterns.

    Placeholder: the check is not implemented yet, so the record is
    returned unchanged.

    Args:
        file_info: File record to inspect.

    Returns:
        The same ``file_info`` object.
    """
    # TODO: implement the file name check.
    return file_info
def check_comment_blocks(file_info, data):
    """Check for known patterns in the comment blocks of a file.

    Placeholder: the check is not implemented yet; ``data`` is ignored
    and the unchanged record is returned as the only result.

    Args:
        file_info: File record to inspect.
        data: Content of the file (currently unused).

    Returns:
        A one-element list holding the same ``file_info`` object.
    """
    # TODO: implement the comment block check.
    results = [file_info]
    return results
def check_code_blocks(file_info, data):
    """Check for known patterns in the code blocks of a file.

    Placeholder: the check is not implemented yet; ``data`` is ignored
    and the unchanged record is returned as the only result.

    Args:
        file_info: File record to inspect.
        data: Content of the file (currently unused).

    Returns:
        A one-element list holding the same ``file_info`` object.
    """
    # TODO: implement the code block check.
    results = [file_info]
    return results
2017-08-24 18:43:48 +00:00
def analyse_checksum(zipfile, js_file, js_info):
    """Classify a file as a known library by its sha1 or md5 checksum.

    For every library in the identifier file, the ``sha1`` and ``md5``
    entries are visited in the order in which they appear in that
    library's definition, and the first entry whose hash equals the
    file's hash wins.

    Args:
        zipfile: Unused by this function.
        js_file: Unused by this function.
        js_info: File record. ``js_info['sha1']`` and ``js_info['md5']``
            must offer a ``hex()`` method; the record is updated in place
            when a hash matches.

    Returns:
        ``[js_info]`` for the first matching hash, otherwise ``None``.
    """
    detection_by_hash = {
        'sha1': DetectionType.SHA1,
        'md5': DetectionType.MD5,
    }
    for lib, lib_spec in load_lib_identifiers().items():
        for hash_name in lib_spec:
            if hash_name not in detection_by_hash:
                continue
            for lib_file in lib_spec[hash_name]:
                if lib_file[hash_name].lower() != js_info[hash_name].hex():
                    continue
                js_info['lib'] = lib
                js_info['version'] = lib_file['version']
                js_info['type'] = FileClassification.LIBRARY
                # Store the enum member itself; a stray trailing comma
                # used to turn the sha1 result into a 1-tuple.
                js_info['detectionMethod'] = detection_by_hash[hash_name]
                if 'comment' in lib_file:
                    js_info['detectionMethodDetails'] = lib_file['comment']
                return [js_info]
    return None
2017-08-24 18:43:48 +00:00
def analyse_known_filename(zipfile, js_file, js_info):
    """Check the file name against the file name patterns of known libraries.

    Args:
        zipfile: ``None`` when ``js_file`` is a plain path string;
            otherwise ``js_file`` is expected to carry the name in its
            ``filename`` attribute.
        js_file: Path string or archive entry (see ``zipfile``).
        js_info: File record; updated in place for every matching
            library, so afterwards it reflects the last match.

    Returns:
        A list with one record per matching library (empty when nothing
        matches). Each record is a snapshot of ``js_info`` taken right
        after that match, so several matches no longer alias one object
        that only shows the last match.

    Raises:
        IndexError: If a matching pattern defines fewer than two groups
            (the version is read from group 2).
    """
    # The file name does not depend on the library, so resolve it once.
    filename = js_file.filename if zipfile is not None else js_file

    libs = []
    for lib, regex in load_lib_identifiers().items():
        if 'filename' not in regex:
            continue
        filename_matched = re.search(regex['filename'], filename,
                                     re.IGNORECASE)
        if not filename_matched:
            continue
        js_info['lib'] = lib
        js_info['version'] = filename_matched.group(2)
        js_info['type'] = FileClassification.LIBRARY
        js_info['detectionMethod'] = DetectionType.FILENAME
        js_info['detectionMethodDetails'] = regex['filename']
        libs.append(dict(js_info))
    return libs
def analyse_generic_filename(zipfile, js_file, js_info):
    """Check the file name against the generic "name-version" pattern.

    Args:
        zipfile: ``None`` when ``js_file`` is a plain path string;
            otherwise ``js_file`` is expected to carry the name in its
            ``filename`` attribute.
        js_file: Path string or archive entry (see ``zipfile``).
        js_info: File record; updated in place when the pattern matches.

    Returns:
        ``[js_info]`` when the pattern matches, otherwise an empty list.
    """
    filename = js_file if zipfile is None else js_file.filename

    version_match = unknown_filename_identifier().search(filename)
    if not version_match:
        return []

    # Library name: base name of the part in front of the version, with
    # every '.js' and then every '.min' occurrence removed.
    lib_name = os.path.basename(version_match.group(1))
    lib_name = lib_name.replace('.js', '').replace('.min', '')

    js_info['lib'] = lib_name
    js_info['version'] = version_match.group(2)
    js_info['type'] = FileClassification.LIKELY_LIBRARY
    js_info['detectionMethod'] = DetectionType.FILENAME
    return [js_info]
def analyse_filename(zipfile, js_file, js_info):
    """Check the file name for library patterns, known ones first.

    The generic file name pattern is only consulted when none of the
    known library patterns produced a result.

    Args:
        zipfile: Forwarded unchanged to both file name analyses.
        js_file: Forwarded unchanged to both file name analyses.
        js_info: File record, forwarded unchanged to both analyses.

    Returns:
        The result of the known-library analysis if it is non-empty,
        otherwise the result of the generic analysis.
    """
    return (analyse_known_filename(zipfile, js_file, js_info)
            or analyse_generic_filename(zipfile, js_file, js_info))
def analyse_comment_known_libs(zipfile, js_file, js_info, comment):
    """Search a comment block for identifiers of known libraries.

    Args:
        zipfile: Unused by this function.
        js_file: Unused by this function.
        js_info: File record; updated in place for every match, so
            afterwards it reflects the last match.
        comment: Comment block; its text is read from
            ``comment.content``.

    Returns:
        A list with one record per match (empty when nothing matches).
        Each record is a snapshot of ``js_info`` taken right after that
        match, so several matches no longer alias one object that only
        shows the last match.

    Raises:
        IndexError: If a matching pattern defines fewer than two groups
            (the version is read from group 2).
    """
    libs = []
    for lib, regex in load_lib_identifiers().items():
        if 'filecontent' not in regex:
            continue
        patterns = regex['filecontent']
        if isinstance(patterns, str):
            # A lone pattern must not be iterated character by character.
            patterns = [patterns]
        for pattern in patterns:
            # Patterns loaded from JSON are plain strings and have no
            # finditer(); re.compile() accepts both strings and already
            # compiled patterns.
            for match in re.compile(pattern).finditer(comment.content):
                js_info['lib'] = lib
                js_info['version'] = match.group(2)
                js_info['detectionMethod'] = DetectionType.COMMENTBLOCK
                js_info['detectionMethodDetails'] = pattern
                js_info['type'] = FileClassification.LIBRARY
                libs.append(dict(js_info))
    return libs
def analyse_comment_generic_libs(zipfile, js_file, js_info, comment):
    """Search a comment block for generic library version headers.

    The library name is derived from the file name (base name with every
    '.js' and then every '.min' occurrence removed); only the version is
    taken from the comment text.

    Args:
        zipfile: ``None`` when ``js_file`` is a plain path string;
            otherwise ``js_file`` is expected to carry the name in its
            ``filename`` attribute.
        js_file: Path string or archive entry (see ``zipfile``).
        js_info: File record; updated in place for every match, so
            afterwards it reflects the last match.
        comment: Comment block; its text is read from
            ``comment.content``.

    Returns:
        A list with one record per match (empty when nothing matches).
        Each record is a snapshot of ``js_info`` taken right after that
        match, so several matches no longer alias one object that only
        shows the last match.
    """
    filename = js_file.filename if zipfile is not None else js_file
    # The name depends on the file only, so derive it once up front.
    lib_name = os.path.basename(filename).replace('.js', '').replace(
        '.min', '')

    libs = []
    for unkregex in unknown_lib_identifiers():
        for match in unkregex.finditer(comment.content):
            js_info['lib'] = lib_name
            js_info['version'] = match.group(2)
            js_info['detectionMethod'] = DetectionType.COMMENTBLOCK
            js_info['detectionMethodDetails'] = unkregex
            js_info['type'] = FileClassification.LIKELY_LIBRARY
            libs.append(dict(js_info))
    return libs
def analyse_comment_blocks(zipfile, js_file, js_info):
    """Search the comment blocks of a JavaScript file for library identifiers.

    Each comment block is first checked for identifiers of known
    libraries; the generic version headers are only consulted for blocks
    where that check found nothing.

    Args:
        zipfile: Archive that contains ``js_file``, or ``None`` when
            ``js_file`` is a path on disk.
        js_file: Archive entry or path of the file to analyse.
        js_info: File record; ``js_info['encoding']`` is used to decode
            archive entries.

    Returns:
        The list of results over all comment blocks. This is a
        best-effort analysis: if reading, decoding or analysing the file
        fails, an empty list is returned instead of raising.
    """

    def mince_js_fileobj(js_text_file_obj):
        """Collect the results of all comment blocks of an open text file."""
        found = []
        for block in mince_js(js_text_file_obj,
                              single_line_comments_block=True):
            if not block.is_comment():
                continue
            block_libs = analyse_comment_known_libs(zipfile, js_file,
                                                    js_info, block)
            # The analysis returns an (possibly empty) list, never None,
            # so the fallback has to trigger on "nothing found".
            if not block_libs:
                block_libs = analyse_comment_generic_libs(
                    zipfile, js_file, js_info, block)
            if block_libs:
                found.extend(block_libs)
        return found

    try:
        if zipfile is not None:
            with zipfile.open(js_file) as js_file_obj:
                with io.TextIOWrapper(js_file_obj,
                                      js_info['encoding']) as js_text_file_obj:
                    return mince_js_fileobj(js_text_file_obj)
        # NOTE(review): this branch relies on the default text encoding,
        # whereas the archive branch uses js_info['encoding'] -- confirm
        # whether that difference is intended.
        with open(js_file) as js_text_file_obj:
            return mince_js_fileobj(js_text_file_obj)
    except Exception:
        # Deliberately best effort, but no longer swallowing
        # KeyboardInterrupt/SystemExit as the bare ``except`` did.
        return []
2017-08-24 18:43:48 +00:00
def decompose_js(path_or_zipfileobj):
    """Run the decomposition analysis on a single file or on an archive.

    Every file gets a record (see ``init_file_info``) that is passed
    through a fixed sequence of checks; the first check that sets
    ``detectionMethod`` decides the classification. Files that no check
    recognises are reported as 'likely application'.

    Args:
        path_or_zipfileobj: Either a ``str`` path of one file on disk, or
            an archive object offering ``infolist()`` and ``open()``
            (presumably a ``zipfile.ZipFile`` -- confirm with callers).
            Archive entries whose name has an empty base name are skipped.

    Returns:
        A list with one record per analysed file.
    """
    if isinstance(path_or_zipfileobj, str):
        zipfile = None
        entries = [path_or_zipfileobj]
    else:
        zipfile = path_or_zipfileobj
        entries = [
            entry for entry in zipfile.infolist()
            if os.path.basename(entry.filename) != ""
        ]

    # Checks that settle the classification on their own, in priority order.
    decisive_checks = (
        check_empty_file,
        check_metadata,
        check_sha1,
        check_sha1_decompressed,
        check_sha1_normalized,
        check_sha1_decompressed_normalized,
    )

    inventory = []
    for entry in entries:
        if zipfile is None:
            with open(entry, mode='rb') as raw_file:
                data = raw_file.read()
            path = entry
        else:
            with zipfile.open(entry) as raw_file:
                data = raw_file.read()
            path = entry.filename

        file_info = init_file_info(path, data)

        for check in decisive_checks:
            file_info = check(file_info)
            if file_info['detectionMethod'] is not None:
                break
        else:
            # None of the decisive checks fired: try the file name.
            file_info = check_filename(file_info)
            if file_info['detectionMethod'] is not None:
                # TODO: the comment and code block results are computed
                # but not yet merged into the inventory.
                check_comment_blocks(file_info, data)
                check_code_blocks(file_info, data)
            else:
                # if no library could be detected, we report the
                # JavaScript file as 'application'.
                file_info['lib'] = None
                file_info['version'] = None
                file_info['detectionMethod'] = DetectionType.DEFAULT
                file_info['type'] = FileClassification.LIKELY_APPLICATION

        inventory.append(file_info)

    return inventory