ExtensionCrawler/ExtensionCrawler/jsdecompose.py

219 lines
11 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Python analys providing a decomposition analysis of JavaScript code in
general and Chrome extensions in particular."""
import os
import re
import json
from enum import Enum
import hashlib
2017-08-23 18:17:35 +00:00
class JsDecompose:
    """Classify the ``.js`` members of a zip archive.

    Every JavaScript file is matched against the known-library regexes loaded
    from ``js_identifier.json``, then against generic "name + version"
    patterns, and finally falls back to being recorded as an application.
    Results accumulate in per-instance lists, so one instance keeps the
    findings of every ``detectLibraries`` call made on it.
    """

    # Class variables - shared among all instances of this class.
    # Full path of the JSON file holding the known-library regexes.
    regexFile = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), '../resources/',
        'js_identifier.json')
    with open(regexFile, 'r') as fObject:
        # Read the whole file content as a string.
        jString = fObject.read()
    # Mapping of library name -> dict that may carry a 'filename' regex and/or
    # a list of 'filecontent' regexes (see _detectKnownLibraries).
    libraryIdentifiers = json.loads(jString)

    # Generic "<name>-<version>" filename pattern; used when no known library
    # is found.
    unknownFilenameIdentifier = re.compile(
        r'(.+)[\-\_]([0-9]{1,2}[\.|\-|\_][0-9a-z]{1,2}[\.|\-|\_][0-9a-z\-\_]*)',
        re.IGNORECASE)

    # Generic file-content patterns; used when no known library is found.
    # Group 2 of every pattern is the version string.
    unknownLibraryIdentifier = [
        # MatchType: name version, e.g. mylib v1.2.9b or mylib.anything 1.2.8
        re.compile(
            rb'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\sv?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
            re.IGNORECASE),
        # MatchType: name version: ver, e.g. mylib version: v1.2.9,
        # or mylib.js version 1.2.8
        re.compile(
            rb'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\s(?: version)\:?\s?v?([0-9][0-9.a-z_\\\\-]+)',
            re.IGNORECASE),
        # MatchType: version x.x.x, e.g. @version: 1.2.5 or version - 1.2.5
        re.compile(
            rb'\@*(version)\s?[\:|-]?\s?v?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
            re.IGNORECASE),
        re.compile(
            rb'(version)[\:|\=]\s?.?([0-9]{1,2}[\.|\-|\_][0-9.a-z_\\\\-]+).?',
            re.IGNORECASE),
        re.compile(rb'(.+) v([0-9]{1,2}[\.|\-|\_][0-9]{1,2}[0-9.a-z_\\\\-]*)',
                   re.IGNORECASE),
    ]

    # How a library was recognised; the member *name* is stored in results.
    DetectionType = Enum("DetectionType",
                         'FILENAME FILECONTENT FILENAME_FILECONTENT URL HASH')

    def __init__(self):
        """Create an analyser with empty result lists."""
        self.identifiedKnLibrariesList = []
        self.identifiedUnknownLibrariesDict = {}
        self.identifiedUnLibrariesList = []
        self.identifiedApplicationsList = []

    @staticmethod
    def _buildEntry(jsFile, md5, lib, ver, detectMethod, entryType=None):
        """Build one result record for ``jsFile`` (a zip member info object).

        The 'type' key is only emitted when ``entryType`` is given, which keeps
        the record layout of known-library entries (no 'type') unchanged.
        """
        entry = {'lib': lib, 'ver': ver, 'detectMethod': detectMethod}
        if entryType is not None:
            entry['type'] = entryType
        entry['jsFilename'] = os.path.basename(jsFile.filename)
        entry['md5'] = md5
        entry['size'] = int(jsFile.file_size)
        entry['path'] = jsFile.filename
        return entry

    def _detectKnownLibraries(self, jsFile, data, md5):
        """Record every known library matching ``jsFile``; return True if any."""
        identified = False
        for lib, regex in JsDecompose.libraryIdentifiers.items():
            # METHOD_1: match the member's filename against the library's
            # filename regex, if it defines one.
            if 'filename' in regex:
                filenameMatched = re.search(regex['filename'],
                                            jsFile.filename, re.IGNORECASE)
                if filenameMatched:
                    # basename must be taken from the member's filename; the
                    # info object itself is not a path.
                    self.identifiedKnLibrariesList.append(
                        self._buildEntry(
                            jsFile, md5, lib, filenameMatched.group(2),
                            self.DetectionType.FILENAME.name))
                    identified = True

            # METHOD_2: match the file content against the library's content
            # regexes, if it defines any.
            if 'filecontent' in regex:
                for aFilecontent in regex['filecontent']:
                    libraryMatched = re.search(aFilecontent.encode(), data,
                                               re.IGNORECASE)
                    if not libraryMatched:
                        continue
                    ver = libraryMatched.group(2).decode()
                    # Avoid duplicating a lib/version pair already recorded
                    # (for instance by the filename check above).
                    if not self.isLibExistInList(
                            lib, ver, self.identifiedKnLibrariesList):
                        self.identifiedKnLibrariesList.append(
                            self._buildEntry(
                                jsFile, md5, lib, ver,
                                self.DetectionType.FILECONTENT.name))
                    identified = True
                    # No need to check the other regexes of this library.
                    break
        return identified

    def _detectUnknownLibrary(self, jsFile, data, md5):
        """Try the generic patterns on ``jsFile``; return True on a match."""
        # Check the filename first; on a hit the content is not inspected.
        unkFilenameMatch = self.unknownFilenameIdentifier.search(
            jsFile.filename)
        if unkFilenameMatch:
            self.identifiedUnLibrariesList.append(
                self._buildEntry(
                    jsFile, md5, unkFilenameMatch.group(1),
                    unkFilenameMatch.group(2),
                    self.DetectionType.FILENAME.name, "library"))
            return True

        # Otherwise check the file content.
        for unkregex in JsDecompose.unknownLibraryIdentifier:
            unknownLibraryMatched = unkregex.search(data)
            if not unknownLibraryMatched:
                continue
            unkVer = unknownLibraryMatched.group(2).decode()
            # Use the filename (without '.js' / '.min') as the library name
            # instead of the matched name - safer.
            unkjsFile = jsFile.filename.replace('.js', '').replace('.min', '')
            # Only record it when it is not already among the known libraries.
            if not self.isLibExistInList(unkjsFile, unkVer,
                                         self.identifiedKnLibrariesList):
                self.identifiedUnLibrariesList.append(
                    self._buildEntry(
                        jsFile, md5, unkjsFile, unkVer,
                        self.DetectionType.FILENAME_FILECONTENT.name,
                        "likely_library"))
            # No need to check the rest of the unknown regexes.
            return True
        return False

    def detectLibraries(self, zipfile):
        """Classify every ``.js`` member of an opened zip archive.

        Args:
            zipfile: object offering ``infolist()`` and ``open(member)``; the
                member infos must expose ``filename`` and ``file_size``.

        Returns:
            A list of dicts: known libraries, then unknown libraries, then
            applications, including entries from earlier calls on this
            instance.
        """
        jsFiles = [info for info in zipfile.infolist()
                   if info.filename.endswith(".js")]

        for jsFile in jsFiles:
            with zipfile.open(jsFile) as fObject:
                data = fObject.read()
            md5 = hashlib.md5(data).hexdigest()

            # Known libraries first, then the generic patterns.
            if self._detectKnownLibraries(jsFile, data, md5):
                continue
            if self._detectUnknownLibrary(jsFile, data, md5):
                continue

            # If none of the regexes match, it is likely an application.
            self.identifiedApplicationsList.append(
                self._buildEntry(jsFile, md5, None, None, None,
                                 "application"))

        return (self.identifiedKnLibrariesList +
                self.identifiedUnLibrariesList +
                self.identifiedApplicationsList)

    def isLibExistInList(self, lib, ver, listOfDict):
        """Return True if ``listOfDict`` holds an entry for ``lib``/``ver``.

        Both fields are compared case-insensitively. Entries whose stored
        version is ``None`` (a regex version group that did not participate
        in the match) can never equal ``ver`` and are skipped.
        """
        wantedLib = lib.lower()
        wantedVer = ver.lower()
        for item in listOfDict:
            itemVer = item['ver']
            if itemVer is None:
                continue
            if (item['lib'].lower() == wantedLib and
                    itemVer.lower() == wantedVer):
                return True
        return False