Basic support for tar files and extension IDs.

This commit is contained in:
Achim D. Brucker 2017-09-21 22:23:50 +01:00
parent 712bfff805
commit e07cd21cfc
1 changed files with 64 additions and 26 deletions

View File

@ -20,23 +20,28 @@
import getopt
import io
import re
import os
import sys
import zlib
from io import StringIO
from zipfile import ZipFile
import collections
import tarfile
import datetime
import dateutil
import dateutil.parser
import cchardet as chardet
import jsbeautifier
from ExtensionCrawler.js_decomposer import init_file_info
from ExtensionCrawler.js_mincer import JsBlockType, mince_js
from ExtensionCrawler.js_mincer import mince_js
from ExtensionCrawler.config import const_basedir, get_local_archive_dir
from ExtensionCrawler.archive import last_crx
# Script should run with python 3.4 or 3.5
assert sys.version_info >= (3, 4) and sys.version_info < (3, 6)
JsStringsConfig = collections.namedtuple('JsStringsConfig', [
'comment', 'strings', 'group', 'program', 'beautify', 'regexp'
'comment', 'strings', 'group', 'program', 'beautify', 'basedir', 'regexp'
])
@ -57,7 +62,7 @@ def jsstrings_data(path, data, config):
if config.beautify:
str_data = jsbeautifier.beautify(str_data)
with StringIO(str_data) as str_obj:
with io.StringIO(str_data) as str_obj:
for block in mince_js(
str_obj, single_line_comments_block=config.group):
print_block(config.comment, config.program, config.strings,
@ -73,6 +78,7 @@ def helpmsg():
print(" -g group single line comments")
print(" -c program code")
print(" -b beautify JavaScript files before analyzing them")
print(" -a=<DIR> archive directory")
print(
" -r regexp select only comments/code/strings where regexp matches")
print(
@ -98,6 +104,42 @@ def print_block(comment, program, strings, regexp, block):
print(string)
def analyze_crx(config, crx, path):
    """Feed one member, or every .js/.css member, of a crx to jsstrings_data.

    crx is passed to ZipFile unchanged. When path is None, each archive
    member whose file name ends in ".js" or ".css" is read and analyzed,
    in the order infolist() returns them. Otherwise only the member named
    path is read and analyzed.
    """
    with ZipFile(crx) as crxobj:
        if path is not None:
            # Single-file mode: analyze just the requested member.
            with crxobj.open(path) as member_obj:
                content = member_obj.read()
            jsstrings_data(path, content, config)
            return

        # Whole-archive mode: collect the matching members up front.
        wanted = [
            info for info in crxobj.infolist()
            if info.filename.endswith((".js", ".css"))
        ]
        for info in wanted:
            with crxobj.open(info) as member_obj:
                content = member_obj.read()
                member_name = member_obj.name
            jsstrings_data(member_name, content, config)
def analyze_tar(config, date, filename, path):
    """Analyze the crx that last_crx selects inside an extension tar archive.

    The extension id is the base name of filename without its extension.
    When date is given it is parsed with dateutil; a result without usable
    timezone information is treated as UTC and passed to last_crx as an
    extra argument. The tar file name and the value returned by last_crx
    are printed before the member is extracted and handed to analyze_crx
    together with config and path.
    """
    extid, _ = os.path.splitext(os.path.basename(filename))

    # Optional trailing argument for last_crx: only present when a date
    # was supplied by the caller.
    date_args = []
    if date is not None:
        when = dateutil.parser.parse(date)
        has_offset = (when.tzinfo is not None
                      and when.tzinfo.utcoffset(when) is not None)
        if not has_offset:
            when = when.replace(tzinfo=datetime.timezone.utc)
        date_args.append(when)

    data_dir = os.path.join(config.basedir, "data")
    crx_member = last_crx(data_dir, extid, *date_args)
    print(filename, crx_member)

    with tarfile.open(filename, 'r') as archive, \
            archive.extractfile(crx_member) as crx:
        analyze_crx(config, crx, path)
def main(argv):
"""Main function: JavaScript strings on steroids."""
config = JsStringsConfig(
@ -106,6 +148,7 @@ def main(argv):
group=False,
program=False,
beautify=False,
basedir=const_basedir(),
regexp=None)
filename = None
@ -113,8 +156,9 @@ def main(argv):
date = None
try:
opts, args = getopt.getopt(argv, "hbcd:snvr:",
["--regesp", "--date", "--beautify"])
opts, args = getopt.getopt(argv, "hbcd:sna:vr:", [
"--regesp", "--date", "--archive", "--beautify"
])
except getopt.GetoptError:
helpmsg()
sys.exit(2)
@ -122,6 +166,8 @@ def main(argv):
if opt == '-h':
helpmsg()
sys.exit()
elif opt in ("-a", "--archive"):
config = config._replace(basedir=arg)
elif opt == '-n':
config = config._replace(comment=False)
elif opt == '-s':
@ -145,25 +191,17 @@ def main(argv):
helpmsg()
sys.exit()
if filename.endswith('.crx') and path is not None:
with ZipFile(filename) as crxobj:
with crxobj.open(path) as js_file:
data = js_file.read()
jsstrings_data(path, data, config)
elif filename.endswith('.crx') and path is None:
with ZipFile(filename) as crxobj:
js_files = list(
filter(lambda x: x.filename.endswith(".js"),
crxobj.infolist()))
for jsfile in js_files:
with crxobj.open(jsfile) as js_file_obj:
data = js_file_obj.read()
path = js_file_obj.name
jsstrings_data(path, data, config)
elif filename.endswith('.tar') and path is not None:
pass
elif filename.endswith('.tar') and path is None:
pass
extid_re = re.compile('^[a-p]+$')
if filename.endswith('.crx'):
analyze_crx(config, filename, path)
elif filename.endswith('.tar'):
analyze_tar(config, date, filename, path)
elif extid_re.match(filename):
extid = filename
filename = os.path.join(config.basedir, 'data',
get_local_archive_dir(extid), extid + ".tar")
analyze_tar(config, date, filename, path)
else:
with open(filename, 'rb') as fileobj:
data = fileobj.read()