ExtensionCrawler/ExtensionCrawler/js_decomposer.py

300 lines
11 KiB
Python
Executable File

#!/usr/bin/env python3
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Python analys providing a decomposition analysis of JavaScript code in
general and Chrome extensions in particular."""
import os
import io
import re
import json
from enum import Enum
import hashlib
import cchardet as chardet
from ExtensionCrawler.js_mincer import mince_js
class DetectionType(Enum):
    """Kinds of evidence that can lead to a detection.

    A member of this enumeration is stored under the ``'detectionMethod'``
    key of a jsinfo record.
    """

    # The file name matched a (known or generic) file name pattern.
    FILENAME = "filename"
    # A version header was found inside a comment block.
    COMMENTBLOCK = "comment_block"
    # The members below are declared but not assigned anywhere in this
    # section of the module.
    CODEBLOCK = "code_block"
    FILENAME_COMMENT = "filename_and_comment_block"
    FILENAME_CODE = "filename_and_code_block"
    URL = "known_url"
    # The checksum of the file content matched a known md5 / sha1 value.
    MD5 = "md5"
    SHA1 = "sha1"
    # No analysis produced a result; used for the 'application' fallback.
    DEFAULT = "default"
class FileClassification(Enum):
    """Classification assigned to an analysed JavaScript file.

    A member of this enumeration is stored under the ``'type'`` key of a
    jsinfo record.
    """

    # Identified through a known checksum or a known file name pattern.
    LIBRARY = "known_library"
    # Identified only through generic file name or comment heuristics.
    LIKELY_LIBRARY = "likely_library"
    # Fallback when no library could be detected.
    APPLICATION = "likely_application"
    # The file has a size of zero bytes.
    EMPTY_FILE = "empty_file"
    # Stored under 'detectionMethod' (not 'type') by init_jsinfo() when a
    # file is classified as empty.
    FILE_SIZE = "file_size"
def load_lib_identifiers():
    """Load the identifiers of known libraries from the bundled JSON file.

    The file ``js_identifier.json`` is read from ``../resources/`` relative
    to the directory that contains the real (symlink-resolved) location of
    this module.

    Returns:
        The decoded JSON document.  The analysers in this module iterate it
        as a mapping from library name to a dict of identifiers (keys such
        as ``'filename'``, ``'sha1'`` and ``'md5'``).

    Raises:
        OSError: If the resource file cannot be opened.
        ValueError: If the file does not contain valid JSON.
    """
    regex_file = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), '../resources/',
        'js_identifier.json')
    # JSON text is UTF-8 by specification; decode it explicitly instead of
    # relying on the locale's preferred encoding.
    with open(regex_file, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)
def unknown_filename_identifier():
    """Return the regex used to split a generic file name into name and version.

    The pattern is compiled case-insensitively.  Group 1 captures the text
    in front of a ``-`` or ``_`` separator, group 2 the version-like part
    that follows it: one or two digits, a separator, one or two
    alphanumerics, a separator and an optional run of alphanumerics,
    ``-`` and ``_``.  The separator class is written ``[\\.|\\-|\\_]``, so a
    literal ``|`` is accepted as a separator as well.
    """
    name_and_version = (
        r'(.+)[\-\_]([0-9]{1,2}[\.|\-|\_][0-9a-z]{1,2}[\.|\-|\_][0-9a-z\-\_]*)'
    )
    return re.compile(name_and_version, flags=re.IGNORECASE)
def unknown_lib_identifiers():
    """Return the regexes that recognise generic version headers.

    All patterns are compiled case-insensitively and capture the version
    string in group 2; group 1 holds the matched name or the literal word
    "version".

    Returns:
        A list of five compiled patterns, in the order listed below.
    """
    header_patterns = (
        # MatchType: name version, e.g. mylib v1.2.9b or mylib.anything 1.2.8
        r'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\sv?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
        # MatchType: name version: ver, e.g. mylib version: v1.2.9, or
        # mylib.js version 1.2.8.
        # The pattern used to read '\s(?: version)', i.e. it demanded a
        # whitespace character followed by a literal space, so neither of
        # the examples above matched.  '\s+' accepts one or more whitespace
        # characters and therefore still matches what matched before.
        r'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\s+(?:version)\:?\s?v?([0-9][0-9.a-z_\\\\-]+)',
        # MatchType: version x.x.x, e.g. @version: 1.2.5 or version - 1.2.5 etc.
        r'\@*(version)\s?[\:|-]?\s?v?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
        # MatchType: version assignment, e.g. version="1.2.3" or version:1.2.3
        r'(version)[\:|\=]\s?.?([0-9]{1,2}[\.|\-|\_][0-9.a-z_\\\\-]+).?',
        # MatchType: free text followed by " v" and a version,
        # e.g. jQuery JavaScript Library v1.11.3
        r'(.+) v([0-9]{1,2}[\.|\-|\_][0-9]{1,2}[0-9.a-z_\\\\-]*)',
    )
    return [re.compile(pattern, re.IGNORECASE) for pattern in header_patterns]
def init_jsinfo(zipfile, js_file):
    """Create the initial jsinfo record for a single JavaScript file.

    Args:
        zipfile: Archive object providing ``open()``, or None when *js_file*
            is a path on the file system.
        js_file: With an archive, the member entry (its ``filename`` and
            ``file_size`` attributes are read); otherwise the path of the
            file to read.

    Returns:
        A dict with the detection fields (``lib``, ``version``,
        ``detectionMethod``, ``type`` and the ``evidence*`` keys) set to
        None, plus the detected encoding, the base file name, the raw md5
        and sha1 digests, the size and the path.  For a file of size zero,
        ``detectionMethod`` and ``type`` are pre-set to FILE_SIZE and
        EMPTY_FILE.
    """
    if zipfile is None:
        with open(js_file, mode='rb') as raw_file:
            data = raw_file.read()
        path = js_file
        file_size = len(data)
    else:
        with zipfile.open(js_file) as archive_member:
            data = archive_member.read()
        path = js_file.filename
        # The size recorded in the archive entry is used, not len(data).
        file_size = int(js_file.file_size)

    # Detection-related fields start out empty; the analysers fill them in.
    record = dict.fromkeys((
        'lib',
        'version',
        'detectionMethod',
        'type',
        'evidenceStartPos',
        'evidenceEndPos',
        'evidenceText',
    ))
    record['encoding'] = chardet.detect(data)['encoding']
    record['jsFilename'] = os.path.basename(path)
    record['md5'] = hashlib.md5(data).digest()
    record['sha1'] = hashlib.sha1(data).digest()
    record['size'] = file_size
    record['path'] = path

    if file_size == 0:
        record['detectionMethod'] = FileClassification.FILE_SIZE
        record['type'] = FileClassification.EMPTY_FILE
    return record
def analyse_checksum(zipfile, js_file, js_info):
    """Identify a known library by the checksum (sha1 or md5) of the file.

    Every ``'sha1'`` and ``'md5'`` list of the known-library identifiers is
    compared with the digests stored in *js_info*.  On the first match
    *js_info* is updated in place (``lib``, ``version``, ``type``,
    ``detectionMethod``) and returned.

    init_jsinfo() stores the raw digest bytes, while values decoded from
    JSON are strings; comparing the two directly is always False.  Both
    sides are therefore normalised to lower-case hexadecimal text.
    NOTE(review): this assumes js_identifier.json lists checksums as
    hexadecimal strings -- confirm against the resource file.

    Args:
        zipfile: Not used by this function.
        js_file: Not used by this function.
        js_info: jsinfo record whose ``'sha1'`` / ``'md5'`` values are
            either raw digest bytes or hexadecimal strings.

    Returns:
        ``[js_info]`` if a checksum matched, otherwise None.
    """

    def as_hex(digest):
        """Return *digest* as lower-case hex text (None stays None)."""
        if digest is None:
            return None
        if isinstance(digest, (bytes, bytearray)):
            return bytes(digest).hex()
        return str(digest).lower()

    detection_by_kind = {
        'sha1': DetectionType.SHA1,
        'md5': DetectionType.MD5,
    }
    own_digest = {kind: as_hex(js_info.get(kind)) for kind in detection_by_kind}

    json_data = load_lib_identifiers()
    for lib, identifiers in json_data.items():
        # Iterate the keys in file order so that the precedence between
        # sha1 and md5 entries stays as it was.
        for kind in identifiers:
            if kind not in detection_by_kind or own_digest[kind] is None:
                continue
            for known in identifiers[kind]:
                if as_hex(known[kind]) == own_digest[kind]:
                    js_info['lib'] = lib
                    js_info['version'] = known['version']
                    js_info['type'] = FileClassification.LIBRARY
                    js_info['detectionMethod'] = detection_by_kind[kind]
                    return [js_info]
    return None
def analyse_known_filename(zipfile, js_file, js_info):
    """Check the file name against the patterns of known libraries.

    Args:
        zipfile: Archive object, or None when *js_file* is a plain path.
        js_file: Archive member entry (its ``filename`` is used) or the
            path of the file.
        js_info: jsinfo record of the file.  It is updated in place with
            the values of the most recent match.

    Returns:
        A list with one record per matching library (empty if none
        matched).  Each entry is an independent snapshot of *js_info*, so
        that a later match, or a later analyser writing to *js_info*, does
        not overwrite an earlier result.
    """
    # The name does not depend on the library being tested.
    filename = js_file.filename if zipfile is not None else js_file

    libs = []
    for lib, regex in load_lib_identifiers().items():
        if 'filename' not in regex:
            continue
        filename_matched = re.search(regex['filename'], filename,
                                     re.IGNORECASE)
        if not filename_matched:
            continue
        js_info['lib'] = lib
        js_info['version'] = filename_matched.group(2)
        js_info['type'] = FileClassification.LIBRARY
        js_info['detectionMethod'] = DetectionType.FILENAME
        # Appending js_info itself would make every list entry refer to
        # the same dict.
        libs.append(js_info.copy())
    return libs
def analyse_generic_filename(zipfile, js_file, js_info):
    """Check the file name against the generic "name-version" pattern.

    Args:
        zipfile: Archive object, or None when *js_file* is a plain path.
        js_file: Archive member entry (its ``filename`` is used) or the
            path of the file.
        js_info: jsinfo record of the file.  It is updated in place when
            the pattern matches.

    Returns:
        A list with a single record classified as LIKELY_LIBRARY if the
        name matches, otherwise an empty list.  The entry is a snapshot of
        *js_info*, so analysers that run afterwards and write to *js_info*
        cannot overwrite this result.
    """
    filename = js_file.filename if zipfile is not None else js_file

    unknown_filename_match = unknown_filename_identifier().search(filename)
    if not unknown_filename_match:
        return []

    js_info['lib'] = unknown_filename_match.group(1)
    js_info['version'] = unknown_filename_match.group(2)
    js_info['type'] = FileClassification.LIKELY_LIBRARY
    js_info['detectionMethod'] = DetectionType.FILENAME
    return [js_info.copy()]
def analyse_filename(zipfile, js_file, js_info):
    """Detect a library from the file name.

    The patterns of known libraries are tried first; the generic
    "name-version" pattern is only consulted when none of them matched.
    Returns the (possibly empty) list produced by whichever analysis was
    used last.
    """
    return (analyse_known_filename(zipfile, js_file, js_info)
            or analyse_generic_filename(zipfile, js_file, js_info))
def analyse_comment_known_libs(zipfile, js_file, js_info, comment):
    """Search a comment block for version headers.

    NOTE: despite its name, this function currently applies the generic
    patterns from unknown_lib_identifiers(); no library-specific
    identifiers are consulted.

    Args:
        zipfile: Archive object, or None when *js_file* is a plain path.
        js_file: Archive member entry (its ``filename`` is used) or the
            path of the file.
        js_info: jsinfo record of the file.  It is updated in place with
            the values of the most recent match.
        comment: Comment block; its ``content`` attribute is searched.

    Returns:
        A list with one record per regex match (empty if nothing matched).
        Each entry is an independent snapshot of *js_info*; appending
        *js_info* itself would leave every entry showing only the version
        of the last match.
    """
    filename = js_file.filename if zipfile is not None else js_file
    # The library name is derived from the file name only, so it is the
    # same for every match.
    lib_name = filename.replace('.js', '').replace('.min', '')

    libs = []
    for unkregex in unknown_lib_identifiers():
        for match in unkregex.finditer(comment.content):
            js_info['lib'] = lib_name
            js_info['version'] = match.group(2)
            js_info['detectionMethod'] = DetectionType.COMMENTBLOCK
            js_info['type'] = FileClassification.LIKELY_LIBRARY
            libs.append(js_info.copy())
    return libs
def analyse_comment_generic_libs(zipfile, js_file, js_info, comment):
    """Search a comment block for generic version headers.

    Every pattern returned by unknown_lib_identifiers() is applied to the
    content of the comment block.

    Args:
        zipfile: Archive object, or None when *js_file* is a plain path.
        js_file: Archive member entry (its ``filename`` is used) or the
            path of the file.
        js_info: jsinfo record of the file.  It is updated in place with
            the values of the most recent match.
        comment: Comment block; its ``content`` attribute is searched.

    Returns:
        A list with one record per regex match (empty if nothing matched).
        Each entry is an independent snapshot of *js_info*; appending
        *js_info* itself would leave every entry showing only the version
        of the last match.
    """
    filename = js_file.filename if zipfile is not None else js_file
    # The library name is derived from the file name only, so it is the
    # same for every match.
    lib_name = filename.replace('.js', '').replace('.min', '')

    libs = []
    for unkregex in unknown_lib_identifiers():
        for match in unkregex.finditer(comment.content):
            js_info['lib'] = lib_name
            js_info['version'] = match.group(2)
            js_info['detectionMethod'] = DetectionType.COMMENTBLOCK
            js_info['type'] = FileClassification.LIKELY_LIBRARY
            libs.append(js_info.copy())
    return libs
def analyse_comment_blocks(zipfile, js_file, js_info):
    """Search the comment blocks of a JavaScript file for library identifiers.

    The file is decoded with the encoding recorded in
    ``js_info['encoding']`` and split into blocks by mince_js(); every
    comment block is handed to the comment analysers.

    Args:
        zipfile: Archive object, or None when *js_file* is a plain path.
        js_file: Archive member entry or the path of the file.
        js_info: jsinfo record of the file; passed on to the comment
            analysers, which update it in place.

    Returns:
        The concatenated results of the comment analysers.  The analysis
        is best effort: if reading, decoding or mincing the file raises,
        an empty list is returned.
    """

    def mince_js_fileobj(js_text_file_obj):
        """Analyse all comment blocks of an opened text file object."""
        libs = []
        for block in mince_js(js_text_file_obj,
                              single_line_comments_block=True):
            if not block.is_comment():
                continue
            block_libs = analyse_comment_known_libs(zipfile, js_file,
                                                    js_info, block)
            # The analysers return a list, never None, so the fallback has
            # to trigger on an empty result.
            if not block_libs:
                block_libs = analyse_comment_generic_libs(
                    zipfile, js_file, js_info, block)
            libs += block_libs
        return libs

    try:
        if zipfile is not None:
            with zipfile.open(js_file) as js_file_obj:
                with io.TextIOWrapper(
                        js_file_obj,
                        js_info['encoding']) as js_text_file_obj:
                    return mince_js_fileobj(js_text_file_obj)
        # Use the detected encoding for plain files as well, as is done for
        # archive members; None falls back to the platform default.
        with open(js_file, encoding=js_info['encoding']) as js_text_file_obj:
            return mince_js_fileobj(js_text_file_obj)
    except Exception:
        # Deliberately best effort, but KeyboardInterrupt and SystemExit
        # must still propagate.
        return []
def decompose_js(file):
    """Run the JavaScript decomposition analysis.

    Args:
        file: Either the path of a single JavaScript file (a str) or an
            archive object providing ``infolist()`` and ``open()``
            (presumably a ``zipfile.ZipFile``); in the latter case every
            member whose name ends in ".js" is analysed.

    Returns:
        A list of jsinfo records without duplicates.  Empty files are
        reported as such; files for which no library is detected are
        reported as 'application'.
    """

    def remdups(lst):
        """Return the entries of *lst* in order, skipping repeated ones."""
        unique = []
        for entry in lst:
            if entry in unique:
                continue
            unique.append(entry)
        return unique

    if isinstance(file, str):
        zipfile = None
        js_files = [file]
    else:
        zipfile = file
        js_files = [member for member in zipfile.infolist()
                    if member.filename.endswith(".js")]

    js_inventory = []
    for js_file in js_files:
        js_info = init_jsinfo(zipfile, js_file)
        if js_info['type'] == FileClassification.EMPTY_FILE:
            js_inventory.append(js_info)
            continue

        # A checksum match is final; file name and comment heuristics are
        # only consulted when it fails.
        detections = analyse_checksum(zipfile, js_file, js_info)
        if not detections:
            detections = analyse_filename(zipfile, js_file, js_info)
            detections += analyse_comment_blocks(zipfile, js_file, js_info)

        if detections:
            js_inventory += detections
            continue

        # if no library could be detected, we report the JavaScript file as 'application'.
        js_info.update({
            'lib': None,
            'version': None,
            'detectionMethod': DetectionType.DEFAULT,
            'type': FileClassification.APPLICATION,
        })
        js_inventory.append(js_info)

    return remdups(js_inventory)