2017-08-23 16:22:58 +00:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
#
|
|
|
|
# Copyright (C) 2016,2017 The University of Sheffield, UK
|
|
|
|
#
|
|
|
|
# This program is free software: you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
|
|
"""Python analys providing a decomposition analysis of JavaScript code in
|
|
|
|
general and Chrome extensions in particular."""
|
|
|
|
|
|
|
|
import os
|
|
|
|
import re
|
|
|
|
import json
|
|
|
|
from enum import Enum
|
2017-08-23 18:42:00 +00:00
|
|
|
import hashlib
|
2017-08-23 16:22:58 +00:00
|
|
|
|
|
|
|
|
2017-08-23 18:17:35 +00:00
|
|
|
class JsDecompose:
    """Detect JavaScript libraries and application code inside an archive.

    Patterns for known libraries are loaded once, while the class body is
    executed, from ``../resources/js_identifier.json`` (resolved relative to
    this source file). Generic fallback patterns, used when no known library
    matches, are defined inline below.
    """

    ## Class variables - their values are shared among all instances of this class
    regexFile = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), '../resources/',
        'js_identifier.json')  ## full path

    # Decode explicitly as UTF-8 so that loading the JSON resource does not
    # depend on the locale's default encoding.
    with open(regexFile, 'r', encoding='utf-8') as fObject:
        # read the whole file content as a string
        jString = fObject.read()

    # Maps a library name to its detection patterns: an optional 'filename'
    # regex and an optional list of 'filecontent' regexes.
    libraryIdentifiers = json.loads(jString)

    # Generic "<name>-<version>" / "<name>_<version>" file name pattern.
    unknownFilenameIdentifier = re.compile(
        r'(.+)[\-\_]([0-9]{1,2}[\.|\-|\_][0-9a-z]{1,2}[\.|\-|\_][0-9a-z\-\_]*)',
        re.IGNORECASE)

    # This will be used when no known library is found. The patterns are
    # bytes patterns because they are applied to raw file content; group 2
    # always captures the version string.
    unknownLibraryIdentifier = [
        re.compile(
            rb'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\sv?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
            re.IGNORECASE
        ),  # MatchType: name version, e.g. mylib v1.2.9b or mylib.anything 1.2.8
        re.compile(
            rb'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\s(?: version)\:?\s?v?([0-9][0-9.a-z_\\\\-]+)',
            re.IGNORECASE
        ),  # MatchType: name version: ver, e.g. mylib version: v1.2.9, or mylib.js version 1.2.8
        re.compile(
            rb'\@*(version)\s?[\:|-]?\s?v?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
            re.IGNORECASE
        ),  # MatchType: version x.x.x, e.g. @version: 1.2.5 or version - 1.2.5 etc.
        re.compile(
            rb'(version)[\:|\=]\s?.?([0-9]{1,2}[\.|\-|\_][0-9.a-z_\\\\-]+).?',
            re.IGNORECASE),
        re.compile(rb'(.+) v([0-9]{1,2}[\.|\-|\_][0-9]{1,2}[0-9.a-z_\\\\-]*)',
                   re.IGNORECASE)
    ]

    # TODO: if even the generic regexes do not match, check whether the line
    # contains the string "version" (no pattern has been defined for this yet).

    # Ways in which a library can be detected.
    DetectionType = Enum("DetectionType",
                         'FILENAME FILECONTENT FILENAME_FILECONTENT URL HASH')
|
|
|
|
|
|
|
|
#Class constructor initialiser - is called when a new instance of this class is created
|
2017-08-23 18:17:35 +00:00
|
|
|
def __init__(self):
    """Give the new instance its own, initially empty, result containers."""
    # Records for libraries listed in the known-library patterns.
    self.identifiedKnLibrariesList = list()
    self.identifiedUnknownLibrariesDict = dict()
    # Records for libraries recognised only by the generic patterns.
    self.identifiedUnLibrariesList = list()
    # Records for files in which no library was recognised.
    self.identifiedApplicationsList = list()
|
2017-08-23 18:17:35 +00:00
|
|
|
|
2017-08-23 18:42:00 +00:00
|
|
|
def detectLibraries(self, zipfile):
    """Classify every ``.js`` member of an opened zip archive.

    Each file is checked, in this order, against

    1. the known-library patterns in ``libraryIdentifiers`` (file name
       first, then file content),
    2. the generic file name pattern ``unknownFilenameIdentifier``,
    3. the generic content patterns in ``unknownLibraryIdentifier``.

    A file for which none of these steps recognised a library is recorded
    as an application.

    Args:
        zipfile: An opened archive providing ``infolist()`` and ``open()``,
            e.g. a ``zipfile.ZipFile``.

    Returns:
        list: Known-library records, followed by unknown-library records,
        followed by application records. The records are accumulated on
        the instance, so results of earlier calls on the same instance are
        included as well.
    """

    def makeRecord(jsFile, md5, lib, ver, detectMethod, libType=None):
        # Build one result record; 'type' is left out for known libraries.
        record = {'lib': lib, 'ver': ver, 'detectMethod': detectMethod}
        if libType is not None:
            record['type'] = libType
        # basename() needs the member's path string, not the ZipInfo object.
        record['jsFilename'] = os.path.basename(jsFile.filename)
        record['md5'] = md5
        record['size'] = int(jsFile.file_size)
        record['path'] = jsFile.filename
        return record

    jsFiles = [
        info for info in zipfile.infolist() if info.filename.endswith(".js")
    ]

    for jsFile in jsFiles:
        isApplication = True
        with zipfile.open(jsFile) as fObject:
            data = fObject.read()
        md5 = hashlib.md5(data).hexdigest()

        libraryIdentified = False

        # iterate over the known libraries to check whether one of them matches
        for lib, regex in self.libraryIdentifiers.items():
            ## METHOD_1: match the file name against the library's filename regex
            if 'filename' in regex:
                filenameMatched = re.search(regex['filename'],
                                            jsFile.filename, re.IGNORECASE)
                if filenameMatched:
                    # group 2 of the filename regex holds the version
                    self.identifiedKnLibrariesList.append(
                        makeRecord(jsFile, md5, lib,
                                   filenameMatched.group(2),
                                   self.DetectionType.FILENAME.name))
                    libraryIdentified = True
                    isApplication = False

            ## METHOD_2: match the file content against the library's content regexes
            if 'filecontent' in regex:
                for aFilecontent in regex['filecontent']:
                    # the content is bytes, so the pattern has to be bytes too
                    libraryMatched = re.search(aFilecontent.encode(), data,
                                               re.IGNORECASE)
                    if libraryMatched:
                        ver = libraryMatched.group(2).decode()
                        # avoid recording the same library/version twice
                        if not self.isLibExistInList(
                                lib, ver, self.identifiedKnLibrariesList):
                            self.identifiedKnLibrariesList.append(
                                makeRecord(
                                    jsFile, md5, lib, ver,
                                    self.DetectionType.FILECONTENT.name))

                        libraryIdentified = True
                        isApplication = False
                        # no need to check the other regexes of this library
                        break

        # if none of the known libraries matched, try the generic patterns
        if not libraryIdentified:
            # check the file name
            unkFilenameMatch = self.unknownFilenameIdentifier.search(
                jsFile.filename)
            if unkFilenameMatch:
                self.identifiedUnLibrariesList.append(
                    makeRecord(jsFile, md5, unkFilenameMatch.group(1),
                               unkFilenameMatch.group(2),
                               self.DetectionType.FILENAME.name, "library"))
                isApplication = False
                # no need to check the file content
                continue

            # otherwise check the file content
            for unkregex in self.unknownLibraryIdentifier:
                unknownLibraryMatched = unkregex.search(data)
                if unknownLibraryMatched:
                    unkVer = unknownLibraryMatched.group(2).decode()
                    # use the file name (without '.js' / '.min') as the
                    # library name instead of the matched text - safer
                    unkjsFile = jsFile.filename.replace('.js', '').replace(
                        '.min', '')

                    # make sure this is not an already identified known library
                    if not self.isLibExistInList(
                            unkjsFile, unkVer,
                            self.identifiedKnLibrariesList):
                        self.identifiedUnLibrariesList.append(
                            makeRecord(
                                jsFile, md5, unkjsFile, unkVer,
                                self.DetectionType.FILENAME_FILECONTENT.name,
                                "likely_library"))
                        isApplication = False
                        # no need to check the rest of the generic regexes
                        break

        # if none of the above matched, the file is likely an application
        if isApplication:
            self.identifiedApplicationsList.append(
                makeRecord(jsFile, md5, None, None, None, "application"))

    return (self.identifiedKnLibrariesList + self.identifiedUnLibrariesList +
            self.identifiedApplicationsList)
|
|
|
|
|
|
|
|
def isLibExistInList(self, lib, ver, listOfDict):
    """Tell whether ``listOfDict`` already holds an entry for ``lib``/``ver``.

    Args:
        lib: Library name to look for.
        ver: Version string to look for.
        listOfDict: Records to search; each must provide the keys
            ``'lib'`` and ``'ver'``.

    Returns:
        bool: True if some record matches both the name and the version,
        compared case-insensitively; False otherwise.
    """
    return any(
        entry['lib'].lower() == lib.lower()
        and entry['ver'].lower() == ver.lower()
        for entry in listOfDict)
|