Refactoring.

This commit is contained in:
Achim D. Brucker 2017-08-24 00:44:34 +01:00
parent 920bc74838
commit 486b967d2d
1 changed file with 54 additions and 45 deletions

View File

@ -14,7 +14,6 @@
# #
# You should have received a copy of the GNU General Public License # You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Python analys providing a decomposition analysis of JavaScript code in """Python analys providing a decomposition analysis of JavaScript code in
general and Chrome extensions in particular.""" general and Chrome extensions in particular."""
@ -24,6 +23,7 @@ import json
from enum import Enum from enum import Enum
import hashlib import hashlib
def lib_identifiers(): def lib_identifiers():
"""Initialize identifiers for known libraries from JSON file.""" """Initialize identifiers for known libraries from JSON file."""
regex_file = os.path.join( regex_file = os.path.join(
@ -33,12 +33,14 @@ def lib_identifiers():
json_content = json_file.read() json_content = json_file.read()
return json.loads(json_content) return json.loads(json_content)
def unknown_filename_identifier(): def unknown_filename_identifier():
"""Identifier for extracting version information from unknown/generic file names.""" """Identifier for extracting version information from unknown/generic file names."""
return re.compile( return re.compile(
r'(.+)[\-\_]([0-9]{1,2}[\.|\-|\_][0-9a-z]{1,2}[\.|\-|\_][0-9a-z\-\_]*)', r'(.+)[\-\_]([0-9]{1,2}[\.|\-|\_][0-9a-z]{1,2}[\.|\-|\_][0-9a-z\-\_]*)',
re.IGNORECASE) re.IGNORECASE)
def lib_isin_list(lib, ver, lib_list): def lib_isin_list(lib, ver, lib_list):
"""Check if a specific library/version has already been detected.""" """Check if a specific library/version has already been detected."""
for item in lib_list: for item in lib_list:
@ -47,6 +49,7 @@ def lib_isin_list(lib, ver, lib_list):
return True return True
return False return False
def unknown_lib_identifiers(): def unknown_lib_identifiers():
"""List of identifiers for generic library version headers.""" """List of identifiers for generic library version headers."""
return ([ return ([
@ -69,13 +72,14 @@ def unknown_lib_identifiers():
re.IGNORECASE) re.IGNORECASE)
]) ])
def detectLibraries(zipfile): def detectLibraries(zipfile):
"""JavaScript decomposition analysis for extensions.""" """JavaScript decomposition analysis for extensions."""
detection_type = Enum("DetectionType", detection_type = Enum("DetectionType",
'FILENAME FILECONTENT FILENAME_FILECONTENT URL HASH') 'FILENAME FILECONTENT FILENAME_FILECONTENT URL HASH')
known_libs = [] known_libs = []
unkown_libs = [] unkown_libs = []
identifiedApplicationsList = [] app_js = []
js_files = list( js_files = list(
filter(lambda x: x.filename.endswith(".js"), zipfile.infolist())) filter(lambda x: x.filename.endswith(".js"), zipfile.infolist()))
@ -86,15 +90,16 @@ def detectLibraries(zipfile):
with zipfile.open(js_file) as js_file_obj: with zipfile.open(js_file) as js_file_obj:
data = js_file_obj.read() data = js_file_obj.read()
js_info = {'lib': None, js_info = {
'ver': None, 'lib': None,
'detectMethod': None, 'ver': None,
'type': None, 'detectMethod': None,
'jsFilename': os.path.basename(js_file.filename), 'type': None,
'md5': hashlib.md5(data).hexdigest(), 'jsFilename': os.path.basename(js_file.filename),
'size': int(js_file.file_size), 'md5': hashlib.md5(data).hexdigest(),
'path': js_file.filename 'size': int(js_file.file_size),
} 'path': js_file.filename
}
lib_identified = False lib_identified = False
@ -104,13 +109,14 @@ def detectLibraries(zipfile):
#if it matches one of the defined filename regexes, store in the dict #if it matches one of the defined filename regexes, store in the dict
#check whether a filename regex exists for this lib #check whether a filename regex exists for this lib
if 'filename' in regex: if 'filename' in regex:
filenameMatched = re.search(regex['filename'], filename_matched = re.search(regex['filename'],
js_file.filename, re.IGNORECASE) js_file.filename, re.IGNORECASE)
if filenameMatched: if filename_matched:
#check whether this lib has already been identified in the dict, otherwise store the libname and version from the filename # check whether this lib has already been identified in the dict,
# otherwise store the libname and version from the filename
js_info['lib'] = lib js_info['lib'] = lib
js_info['ver'] = filenameMatched.group(2) js_info['ver'] = filename_matched.group(2)
js_info['type'] = "library" js_info['type'] = "library"
js_info['detectMethod'] = detection_type.FILENAME.name js_info['detectMethod'] = detection_type.FILENAME.name
known_libs.append(js_info) known_libs.append(js_info)
@ -121,32 +127,34 @@ def detectLibraries(zipfile):
#check whether a filecontent regex exists for this lib #check whether a filecontent regex exists for this lib
if 'filecontent' in regex: if 'filecontent' in regex:
#iterate over the filecontent regexes for this to see if it has a match #iterate over the filecontent regexes for this to see if it has a match
for aFilecontent in regex['filecontent']: for file_content in regex['filecontent']:
libraryMatched = re.search(aFilecontent.encode(), data, lib_matched = re.search(file_content.encode(), data,
re.IGNORECASE) re.IGNORECASE)
if libraryMatched: if lib_matched:
ver = libraryMatched.group(2).decode() ver = lib_matched.group(2).decode()
if (not lib_isin_list( if not lib_isin_list(lib, ver, known_libs):
lib, ver, known_libs)):
js_info['lib'] = lib js_info['lib'] = lib
js_info['ver'] = ver js_info['ver'] = ver
js_info['type'] = "library" js_info['type'] = "library"
js_info['detectMethod'] = detection_type.FILECONTENT.name js_info[
known_libs.append(js_info) 'detectMethod'] = detection_type.FILECONTENT.name
known_libs.append(js_info)
lib_identified = True lib_identified = True
is_app_js = False is_app_js = False
break break
#do not need to check the other regex for this library - since it's already found # do not need to check the other regex for this library,
# since it's already found
#if none of the regexes in the repository match, check whether the unknown regexes match #if none of the regexes in the repository match, check whether the unknown
# regexes match
if not lib_identified: if not lib_identified:
#check the filename #check the filename
unkFilenameMatch = unknown_filename_identifier().search( unknown_filename_match = unknown_filename_identifier().search(
js_file.filename) js_file.filename)
if unkFilenameMatch: if unknown_filename_match:
js_info['lib'] = unkFilenameMatch.group(1) js_info['lib'] = unknown_filename_match.group(1)
js_info['ver'] = unkFilenameMatch.group(2) js_info['ver'] = unknown_filename_match.group(2)
js_info['type'] = "likely_library" js_info['type'] = "likely_library"
js_info['detectMethod'] = detection_type.FILENAME.name js_info['detectMethod'] = detection_type.FILENAME.name
unkown_libs.append(js_info) unkown_libs.append(js_info)
@ -157,20 +165,22 @@ def detectLibraries(zipfile):
#otherwise check the filecontent #otherwise check the filecontent
for unkregex in unknown_lib_identifiers(): for unkregex in unknown_lib_identifiers():
#print("Analysing for regex: {}".format(unkregex)) #print("Analysing for regex: {}".format(unkregex))
unknownLibraryMatched = unkregex.search(data) unkown_lib_matched = unkregex.search(data)
if unknownLibraryMatched: if unkown_lib_matched:
#check whether this library is actually unknown, by comparing it with identified dicts #check whether this library is actually unknown, by comparing it with
#unkLib = unknownLibraryMatched.group(1).lower().decode() # identified dicts
unkVer = unknownLibraryMatched.group(2).decode() unkown_version = unkown_lib_matched.group(2).decode()
unkjsFile = ((js_file.filename).replace( unkown_js_file = ((js_file.filename).replace(
'.js', '')).replace('.min', '') '.js', '')).replace('.min', '')
if (not lib_isin_list(unkjsFile, unkVer, if (not lib_isin_list(unkown_js_file, unkown_version,
known_libs)): known_libs)):
#put this unknown library in the unknown dictionary. use the filename instead - safer #put this unknown library in the unknown dictionary. use the filename
js_info['lib'] = unkjsFile # instead - safer
js_info['ver'] = unkVer js_info['lib'] = unkown_js_file
js_info['detectMethod'] = detection_type.FILENAME_FILECONTENT.name js_info['ver'] = unkown_version
js_info[
'detectMethod'] = detection_type.FILENAME_FILECONTENT.name
js_info['type'] = "likely_library" js_info['type'] = "likely_library"
unkown_libs.append(js_info) unkown_libs.append(js_info)
is_app_js = False is_app_js = False
@ -183,7 +193,6 @@ def detectLibraries(zipfile):
js_info['ver'] = None js_info['ver'] = None
js_info['detectMethod'] = None js_info['detectMethod'] = None
js_info['type'] = "application" js_info['type'] = "application"
identifiedApplicationsList.append(js_info) app_js.append(js_info)
return (known_libs + unkown_libs + return known_libs + unkown_libs + app_js
identifiedApplicationsList)