ExtensionCrawler/ExtensionCrawler/jsdecompose.py

#!/usr/bin/env python3
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""Python module providing a decomposition analysis of JavaScript code in
   general and Chrome extensions in particular."""

import os
import re
import json
from enum import Enum
import hashlib


class JsDecompose:
    """Classify the ``.js`` members of a zip archive.

    Every ``.js`` file is sorted into one of three buckets:

    * known library   - matches a filename or file-content regex loaded from
      ``js_identifier.json``;
    * unknown library - matches one of the generic "name + version"
      heuristics defined below;
    * application     - matches none of the above.

    The result lists are instance attributes that are only ever appended to,
    so calling ``detectLibraries`` several times on the same instance returns
    the accumulated records of all calls.
    """

    ##Class variables- whose values are shared among 'all' instances of this 'class'
    regexFile = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), '../resources/',
        'js_identifier.json')  ##full path

    # The identifier file is read once, while the class body is executed.
    with open(regexFile, 'r') as fObject:
        #read the whole file content as a string
        jString = fObject.read()

    # Mapping: library name -> dict that may hold a 'filename' regex (str)
    # and/or a 'filecontent' list of regexes (str); see the detection code.
    libraryIdentifiers = json.loads(jString)

    # NOTE(review): inside a character class '|' is a literal pipe, not an
    # alternation, so e.g. [\.|\-|\_] also accepts '|'. The patterns are kept
    # unchanged because altering them changes what gets detected.
    unknownFilenameIdentifier = re.compile(
        r'(.+)[\-\_]([0-9]{1,2}[\.|\-|\_][0-9a-z]{1,2}[\.|\-|\_][0-9a-z\-\_]*)',
        re.IGNORECASE)

    #this will be used, when no known library is found
    unknownLibraryIdentifier = [
        re.compile(
            rb'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\sv?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
            re.IGNORECASE
        ),  #MatchType: name version, e.g. mylib v1.2.9b or mylib.anything 1.2.8
        # NOTE(review): '\s(?: version)' requires a whitespace character
        # followed by a literal space before "version" - confirm this is
        # intended, the example in the comment below has a single space.
        re.compile(
            rb'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\s(?: version)\:?\s?v?([0-9][0-9.a-z_\\\\-]+)',
            re.IGNORECASE
        ),  #MatchType: name version: ver, e.g. mylib version: v1.2.9, or mylib.js version 1.2.8
        re.compile(
            rb'\@*(version)\s?[\:|-]?\s?v?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
            re.IGNORECASE
        ),  #MatchType: version x.x.x, e.g. @version: 1.2.5 or version - 1.2.5 etc.
        re.compile(
            rb'(version)[\:|\=]\s?.?([0-9]{1,2}[\.|\-|\_][0-9.a-z_\\\\-]+).?',
            re.IGNORECASE),
        re.compile(rb'(.+) v([0-9]{1,2}[\.|\-|\_][0-9]{1,2}[0-9.a-z_\\\\-]*)',
                   re.IGNORECASE)
    ]

    #if even the generic regex is not matched, check if the line contains the string "version"
    #versionOnlyIdentifier =

    DetectionType = Enum("DetectionType",
                         'FILENAME FILECONTENT FILENAME_FILECONTENT URL HASH')

    #Class constructor initialiser - is called when a new instance of this class is created
    def __init__(self):
        """Create empty result lists for the three classification buckets."""
        self.identifiedKnLibrariesList = []
        self.identifiedUnknownLibrariesDict = {}
        self.identifiedUnLibrariesList = []
        self.identifiedApplicationsList = []

    @staticmethod
    def _makeRecord(jsFile, md5, lib, ver, detectMethod, recordType=None):
        """Build one result dict for the archive member ``jsFile``.

        ``jsFile`` is a zip info entry (``filename`` and ``file_size`` are
        read). The ``'type'`` key is only inserted when ``recordType`` is
        given, which keeps known-library records free of that key.
        """
        record = {'lib': lib, 'ver': ver, 'detectMethod': detectMethod}
        if recordType is not None:
            record['type'] = recordType
        # basename() needs the member *name*; passing the info object itself
        # raises TypeError.
        record['jsFilename'] = os.path.basename(jsFile.filename)
        record['md5'] = md5
        record['size'] = int(jsFile.file_size)
        record['path'] = jsFile.filename
        return record

    def _detectKnownLibraries(self, jsFile, data, md5):
        """Match ``jsFile`` against every entry of ``libraryIdentifiers``.

        Appends a record to ``identifiedKnLibrariesList`` for each hit and
        returns True if at least one library matched by name or by content.
        """
        libraryIdentified = False

        for lib, regex in JsDecompose.libraryIdentifiers.items():
            ##METHOD_1: the filename matches the library's filename regex
            if 'filename' in regex:
                filenameMatched = re.search(regex['filename'],
                                            jsFile.filename, re.IGNORECASE)
                if filenameMatched:
                    # Filename hits are recorded without a duplicate check.
                    self.identifiedKnLibrariesList.append(
                        self._makeRecord(jsFile, md5, lib,
                                         filenameMatched.group(2),
                                         self.DetectionType.FILENAME.name))
                    libraryIdentified = True

            ##METHOD_2: the file content matches one of the content regexes
            if 'filecontent' in regex:
                for aFilecontent in regex['filecontent']:
                    libraryMatched = re.search(aFilecontent.encode(), data,
                                               re.IGNORECASE)
                    if not libraryMatched:
                        continue
                    ver = libraryMatched.group(2).decode()
                    # Skip the append when the same lib/version pair was
                    # already recorded (e.g. by the filename match above).
                    if not self.isLibExistInList(
                            lib, ver, self.identifiedKnLibrariesList):
                        self.identifiedKnLibrariesList.append(
                            self._makeRecord(
                                jsFile, md5, lib, ver,
                                self.DetectionType.FILECONTENT.name))
                    libraryIdentified = True
                    #do not need to check the other regex for this library - since its already found
                    break

        return libraryIdentified

    def _detectUnknownLibrary(self, jsFile, data, md5):
        """Apply the generic name/version heuristics to ``jsFile``.

        Returns True when the filename or the content looks like a versioned
        library; a record is then appended to ``identifiedUnLibrariesList``
        (unless the same name/version is already in the known list).
        """
        #check the filename first; if it matches the content is not inspected
        unkFilenameMatch = self.unknownFilenameIdentifier.search(
            jsFile.filename)
        if unkFilenameMatch:
            self.identifiedUnLibrariesList.append(
                self._makeRecord(jsFile, md5, unkFilenameMatch.group(1),
                                 unkFilenameMatch.group(2),
                                 self.DetectionType.FILENAME.name,
                                 "library"))
            return True

        #otherwise check the filecontent
        for unkregex in JsDecompose.unknownLibraryIdentifier:
            unknownLibraryMatched = unkregex.search(data)
            if not unknownLibraryMatched:
                continue
            unkVer = unknownLibraryMatched.group(2).decode()
            #use the filename instead of the matched name - safer
            unkjsFile = jsFile.filename.replace('.js', '').replace('.min', '')
            if not self.isLibExistInList(unkjsFile, unkVer,
                                         self.identifiedKnLibrariesList):
                self.identifiedUnLibrariesList.append(
                    self._makeRecord(
                        jsFile, md5, unkjsFile, unkVer,
                        self.DetectionType.FILENAME_FILECONTENT.name,
                        "likely_library"))
            #do not need to check the rest of the unknown regexes
            return True

        return False

    def detectLibraries(self, zipfile):
        """Classify every ``.js`` member of the opened archive ``zipfile``.

        ``zipfile`` must provide ``infolist()`` and ``open(member)``.

        Returns the concatenation of the known-library, unknown-library and
        application records collected on this instance so far (in that
        order). Each record is a dict with the keys ``lib``, ``ver``,
        ``detectMethod``, ``jsFilename``, ``md5``, ``size`` and ``path``;
        unknown-library and application records also carry ``type``.
        """
        jsFiles = [info for info in zipfile.infolist()
                   if info.filename.endswith(".js")]

        for jsFile in jsFiles:
            with zipfile.open(jsFile) as fObject:
                data = fObject.read()
            md5 = hashlib.md5(data).hexdigest()

            if self._detectKnownLibraries(jsFile, data, md5):
                continue
            #if none of the regexes in the repository match, check whether the unknown regexes match
            if self._detectUnknownLibrary(jsFile, data, md5):
                continue

            #if none of the above regexes match, then it is likely an application
            self.identifiedApplicationsList.append(
                self._makeRecord(jsFile, md5, None, None, None,
                                 "application"))

        return (self.identifiedKnLibrariesList + self.identifiedUnLibrariesList +
                self.identifiedApplicationsList)

    def isLibExistInList(self, lib, ver, listOfDict):
        """Return True if ``listOfDict`` holds a record whose ``'lib'`` and
        ``'ver'`` equal ``lib`` and ``ver``, compared case-insensitively."""
        wantedLib = lib.lower()
        wantedVer = ver.lower()
        return any(item['lib'].lower() == wantedLib and
                   item['ver'].lower() == wantedVer
                   for item in listOfDict)
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`#!/usr/bin/env python3`
			`#`
			`# Copyright (C) 2016,2017 The University of Sheffield, UK`
			`#`
			`# This program is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# This program is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with this program. If not, see <http://www.gnu.org/licenses/>.`

			`"""Python analys providing a decomposition analysis of JavaScript code in`
			`general and Chrome extensions in particular."""`

			`import os`
			`import re`
			`import json`
			`from enum import Enum`
Integrated JavaScript decomposition analysis. 2017-08-23 18:42:00 +00:00			`import hashlib`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00

Cleanup. 2017-08-23 18:17:35 +00:00			`class JsDecompose:`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`##Class variables- whose values are shared among 'all' instances of this 'class'`
			`regexFile = os.path.join(`
			`os.path.dirname(os.path.realpath(__file__)), '../resources/',`
			`'js_identifier.json') ##full path`

			`with open(regexFile, 'r') as fObject:`
			`#read the whole file content as a string`
			`jString = fObject.read()`

			`libraryIdentifiers = json.loads(jString)`

			`unknownFilenameIdentifier = re.compile(`
Minor code cleanup. 2017-08-23 16:36:41 +00:00			`r'(.+)[\-\_]([0-9]{1,2}[\.\|\-\|\_][0-9a-z]{1,2}[\.\|\-\|\_][0-9a-z\-\_]*)',`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`re.IGNORECASE)`

			`#this will be used, when no known library is found`
			`unknownLibraryIdentifier = [`
			`re.compile(`
Minor code cleanup. 2017-08-23 16:36:41 +00:00			`rb'[\/\|\/\/\|\s]\*?\s?([a-zA-Z0-9\.]+)\sv?([0-9][\.\|\-\|\_][0-9.a-z_\\\\-]+)',`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`re.IGNORECASE`
			`), #MatchType: name version, e.g. mylib v1.2.9b or mylib.anything 1.2.8`
			`re.compile(`
Minor code cleanup. 2017-08-23 16:36:41 +00:00			`rb'[\/\|\/\/\|\s]\*?\s?([a-zA-Z0-9\.]+)\s(?: version)\:?\s?v?([0-9][0-9.a-z_\\\\-]+)',`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`re.IGNORECASE`
			`), #MatchType: name version: ver, e.g. mylib version: v1.2.9, or mylib.js version 1.2.8`
			`re.compile(`
Minor code cleanup. 2017-08-23 16:36:41 +00:00			`rb'\@*(version)\s?[\:\|-]?\s?v?([0-9][\.\|\-\|\_][0-9.a-z_\\\\-]+)',`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`re.IGNORECASE`
			`), #MatchType: version x.x.x, e.g. @version: 1.2.5 or version - 1.2.5 etc.`
			`re.compile(`
Minor code cleanup. 2017-08-23 16:36:41 +00:00			`rb'(version)[\:\|\=]\s?.?([0-9]{1,2}[\.\|\-\|\_][0-9.a-z_\\\\-]+).?',`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`re.IGNORECASE),`
Minor code cleanup. 2017-08-23 16:36:41 +00:00			`re.compile(rb'(.+) v([0-9]{1,2}[\.\|\-\|\_][0-9]{1,2}[0-9.a-z_\\\\-]*)',`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`re.IGNORECASE)`
			`]`

			`#if even the generic regex is not matched, check if the line contains the string "version"`
Minor code cleanup. 2017-08-23 16:36:41 +00:00			`#versionOnlyIdentifier =`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00
			`DetectionType = Enum("DetectionType",`
			`'FILENAME FILECONTENT FILENAME_FILECONTENT URL HASH')`

			`#Class constructor initialiser - is called when a new instance of this class is created`
Cleanup. 2017-08-23 18:17:35 +00:00			`def __init__(self):`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`self.identifiedKnLibrariesList = []`
			`self.identifiedUnknownLibrariesDict = {}`
			`self.identifiedUnLibrariesList = []`
			`self.identifiedApplicationsList = []`
Cleanup. 2017-08-23 18:17:35 +00:00
Integrated JavaScript decomposition analysis. 2017-08-23 18:42:00 +00:00			`def detectLibraries(self, zipfile):`
			`jsFiles = list(filter(lambda x: x.filename.endswith(".js"),`
			`zipfile.infolist()))`

Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`#for each jsFile path in the list`
Cleanup. 2017-08-23 18:17:35 +00:00			`for jsFile in list(jsFiles):`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`#check whether the file is empty`

			`isApplication = True`
Cleanup. 2017-08-23 18:17:35 +00:00			`with zipfile.open(jsFile) as fObject:`
			`data = fObject.read()`
Integrated JavaScript decomposition analysis. 2017-08-23 18:42:00 +00:00			`md5 = hashlib.md5(data).hexdigest()`
Cleanup. 2017-08-23 18:17:35 +00:00
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`libraryIdentified = False`

			`#iterate over the library regexes, to check whether it has a match`
Cleanup. 2017-08-23 18:17:35 +00:00			`for lib, regex in JsDecompose.libraryIdentifiers.items():`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`##METHOD_1: Read the filename of this file`
			`#if it matches to one of the defined filename regex, store in the dict`
			`#check if there is a filename regex exists for this lib`
Minor code cleanup. 2017-08-23 16:36:41 +00:00			`if 'filename' in regex:`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`filenameMatched = re.search(regex['filename'],`
Cleanup. 2017-08-23 18:17:35 +00:00			`jsFile.filename,`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`re.IGNORECASE)`

Minor code cleanup. 2017-08-23 16:36:41 +00:00			`if filenameMatched:`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`#check whether this lib has already been identified in the dict, otherwise store the libname and version from the filename`
			`ver = filenameMatched.group(2)`
			`self.identifiedKnLibrariesList.append({`
			`'lib': lib,`
			`'ver': ver,`
			`'detectMethod':`
			`self.DetectionType.FILENAME.name,`
			`'jsFilename': os.path.basename(jsFile),`
Integrated JavaScript decomposition analysis. 2017-08-23 18:42:00 +00:00			`'md5': md5,`
			`'size': int(jsFile.file_size),`
Cleanup. 2017-08-23 18:17:35 +00:00			`'path': jsFile.filename`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`})`
			`libraryIdentified = True`
			`isApplication = False`

			`##METHOD_2: Check content of every .js file`
			`#check if there is filecontent regex exists for this lib`
Minor code cleanup. 2017-08-23 16:36:41 +00:00			`if 'filecontent' in regex:`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`#iterate over the filecontent regexes for this lib to see if it has a match`
			`for aFilecontent in regex['filecontent']:`
			`libraryMatched = re.search(aFilecontent.encode(),`
			`data, re.IGNORECASE)`
			`if (libraryMatched):`
			`ver = libraryMatched.group(2).decode()`
			`if (not self.isLibExistInList(`
			`lib, ver,`
			`self.identifiedKnLibrariesList)):`
			`#to be safe, check if the version in the filename, matches with the filecontent`
			`self.identifiedKnLibrariesList.append({`
			`'lib': lib,`
			`'ver': ver,`
			`'detectMethod':`
			`self.DetectionType.FILECONTENT.name,`
			`'jsFilename': os.path.basename(jsFile),`
Integrated JavaScript decomposition analysis. 2017-08-23 18:42:00 +00:00			`'md5': md5,`
			`'size': int(jsFile.file_size),`
Cleanup. 2017-08-23 18:17:35 +00:00			`'path': jsFile.filename`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`})`

			`libraryIdentified = True`
			`isApplication = False`
			`break`
			`#do not need to check the other regex for this library - since its already found`

			`#if none of the regexes in the repository match, check whether the unknown regexes match`
Minor code cleanup. 2017-08-23 16:36:41 +00:00			`if not libraryIdentified:`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`#check the filename`
			`unkFilenameMatch = self.unknownFilenameIdentifier.search(`
Cleanup. 2017-08-23 18:17:35 +00:00			`jsFile.filename)`
Minor code cleanup. 2017-08-23 16:36:41 +00:00			`if unkFilenameMatch:`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`self.identifiedUnLibrariesList.append({`
			`'lib': unkFilenameMatch.group(1),`
			`'ver': unkFilenameMatch.group(2),`
			`'detectMethod': self.DetectionType.FILENAME.name,`
Cleanup. 2017-08-23 18:17:35 +00:00			`'type': "library",`
			`'jsFilename': os.path.basename(jsFile.filename),`
Integrated JavaScript decomposition analysis. 2017-08-23 18:42:00 +00:00			`'md5': md5,`
			`'size': int(jsFile.file_size),`
Cleanup. 2017-08-23 18:17:35 +00:00			`'path': jsFile.filename`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`})`
			`isApplication = False`
			`continue`
			`#do not need to check the filecontent`

			`#otherwise check the filecontent`
Cleanup. 2017-08-23 18:17:35 +00:00			`for unkregex in JsDecompose.unknownLibraryIdentifier:`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`#print("Analysing for regex: {}".format(unkregex))`
			`unknownLibraryMatched = unkregex.search(data)`
Minor code cleanup. 2017-08-23 16:36:41 +00:00			`if unknownLibraryMatched:`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`#check whether this library is actually unknown, by comparing it with identified dicts`
			`#unkLib = unknownLibraryMatched.group(1).lower().decode()`
			`unkVer = unknownLibraryMatched.group(2).decode()`
Cleanup. 2017-08-23 18:17:35 +00:00			`unkjsFile = ((jsFile.filename).replace(`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`'.js', '')).replace('.min', '')`

			`if (not self.isLibExistInList(`
			`unkjsFile, unkVer,`
			`self.identifiedKnLibrariesList)):`
			`#put this unknown library in the unknown dictionary. use the filename instead - safer`
			`self.identifiedUnLibrariesList.append({`
			`'lib': unkjsFile,`
			`'ver': unkVer,`
			`'detectMethod': self.DetectionType.`
Minor code cleanup. 2017-08-23 16:36:41 +00:00			`FILENAME_FILECONTENT.name,`
Cleanup. 2017-08-23 18:17:35 +00:00			`'type': "likely_library",`
			`'jsFilename': os.path.basename(jsFile.filename),`
Integrated JavaScript decomposition analysis. 2017-08-23 18:42:00 +00:00			`'md5': md5,`
			`'size': int(jsFile.file_size),`
Cleanup. 2017-08-23 18:17:35 +00:00			`'path': jsFile.filename`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`})`
			`isApplication = False`
			`break`
			`#do not need to check the rest of the unknown regexes`

Minor code cleanup. 2017-08-23 16:36:41 +00:00			`#if none of the above regexes match, then it is likely an application`
			`if isApplication:`
Cleanup. 2017-08-23 18:17:35 +00:00			`self.identifiedApplicationsList.append({`
			`'lib': None,`
			`'ver': None,`
			`'detectMethod': None,`
			`'type': "application",`
			`'jsFilename': os.path.basename(jsFile.filename),`
Integrated JavaScript decomposition analysis. 2017-08-23 18:42:00 +00:00			`'md5': md5,`
			`'size': int(jsFile.file_size),`
Cleanup. 2017-08-23 18:17:35 +00:00			`'path': jsFile.filename`
			`})`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00
Cleanup. 2017-08-23 18:17:35 +00:00			`return (self.identifiedKnLibrariesList + self.identifiedUnLibrariesList +`
Initial import of JavaScript decomposition framework. 2017-08-23 16:22:58 +00:00			`self.identifiedApplicationsList)`

			`def isLibExistInList(self, lib, ver, listOfDict):`
			`for item in listOfDict:`
			`if (item['lib'].lower() == lib.lower() and`
			`item['ver'].lower() == ver.lower()):`
			`return True`
Cleanup. 2017-08-23 18:17:35 +00:00			`return False`