Basic support for tar files and extension IDs.
This commit is contained in:
parent
712bfff805
commit
e07cd21cfc
|
@ -20,23 +20,28 @@
|
|||
import getopt
|
||||
import io
|
||||
import re
|
||||
import os
|
||||
import sys
|
||||
import zlib
|
||||
from io import StringIO
|
||||
from zipfile import ZipFile
|
||||
import collections
|
||||
import tarfile
|
||||
import datetime
|
||||
import dateutil
|
||||
import dateutil.parser
|
||||
|
||||
import cchardet as chardet
|
||||
import jsbeautifier
|
||||
|
||||
from ExtensionCrawler.js_decomposer import init_file_info
|
||||
from ExtensionCrawler.js_mincer import JsBlockType, mince_js
|
||||
from ExtensionCrawler.js_mincer import mince_js
|
||||
from ExtensionCrawler.config import const_basedir, get_local_archive_dir
|
||||
from ExtensionCrawler.archive import last_crx
|
||||
|
||||
# Script should run with python 3.4 or 3.5
|
||||
assert sys.version_info >= (3, 4) and sys.version_info < (3, 6)
|
||||
|
||||
JsStringsConfig = collections.namedtuple('JsStringsConfig', [
|
||||
'comment', 'strings', 'group', 'program', 'beautify', 'regexp'
|
||||
'comment', 'strings', 'group', 'program', 'beautify', 'basedir', 'regexp'
|
||||
])
|
||||
|
||||
|
||||
|
@ -57,7 +62,7 @@ def jsstrings_data(path, data, config):
|
|||
if config.beautify:
|
||||
str_data = jsbeautifier.beautify(str_data)
|
||||
|
||||
with StringIO(str_data) as str_obj:
|
||||
with io.StringIO(str_data) as str_obj:
|
||||
for block in mince_js(
|
||||
str_obj, single_line_comments_block=config.group):
|
||||
print_block(config.comment, config.program, config.strings,
|
||||
|
@ -73,6 +78,7 @@ def helpmsg():
|
|||
print(" -g group single line comments")
|
||||
print(" -c program code")
|
||||
print(" -b beautify JavaScript files before analyzing them")
|
||||
print(" -a=<DIR> archive directory")
|
||||
print(
|
||||
" -r regexp select only comments/code/strings where regexp matches")
|
||||
print(
|
||||
|
@ -98,6 +104,42 @@ def print_block(comment, program, strings, regexp, block):
|
|||
print(string)
|
||||
|
||||
|
||||
def analyze_crx(config, crx, path):
    """Run the string analysis on files stored inside a crx (zip) archive.

    config -- JsStringsConfig instance, forwarded unchanged to jsstrings_data
    crx    -- whatever ZipFile accepts; callers in this file pass either a
              file name or a file object extracted from a tar archive
    path   -- name of a single archive member to analyze, or None to analyze
              every member whose file name ends in ".js" or ".css"
    """
    with ZipFile(crx) as archive:
        if path is not None:
            # A specific member was requested: analyze just that one.
            with archive.open(path) as member:
                content = member.read()
            jsstrings_data(path, content, config)
            return

        # No member given: collect all JavaScript and CSS entries first.
        candidates = [
            info for info in archive.infolist()
            if info.filename.endswith((".js", ".css"))
        ]
        for info in candidates:
            with archive.open(info) as member:
                content = member.read()
                member_name = member.name
            jsstrings_data(member_name, content, config)
|
||||
|
||||
|
||||
def analyze_tar(config, date, filename, path):
    """Analyze a crx file stored inside an extension's tar archive.

    config   -- JsStringsConfig instance; config.basedir locates the archive
    date     -- date string understood by dateutil.parser.parse, or None
    filename -- path of the tar file; its base name without the extension
                is used as the extension id
    path     -- forwarded to analyze_crx (single member name or None)

    The crx to analyze is chosen by last_crx, which is called with
    "<basedir>/data", the extension id and, if a date was given, the parsed
    date. Its result is used as the member name to extract from the tar
    file (presumably the most recent crx, judging by its name -- verify
    against ExtensionCrawler.archive).
    """
    extid, _ = os.path.splitext(os.path.basename(filename))

    # Optional trailing arguments for last_crx: empty unless a date is given.
    date_args = ()
    if date is not None:
        when = dateutil.parser.parse(date)
        has_offset = (when.tzinfo is not None
                      and when.tzinfo.utcoffset(when) is not None)
        if not has_offset:
            # Dates without usable timezone information are taken as UTC.
            when = when.replace(tzinfo=datetime.timezone.utc)
        date_args = (when, )

    last_crx_file = last_crx(
        os.path.join(config.basedir, "data"), extid, *date_args)

    print(filename, last_crx_file)
    with tarfile.open(filename, 'r') as archive:
        with archive.extractfile(last_crx_file) as crx:
            analyze_crx(config, crx, path)
|
||||
|
||||
|
||||
def main(argv):
|
||||
"""Main function: JavaScript strings on steroids."""
|
||||
config = JsStringsConfig(
|
||||
|
@ -106,6 +148,7 @@ def main(argv):
|
|||
group=False,
|
||||
program=False,
|
||||
beautify=False,
|
||||
basedir=const_basedir(),
|
||||
regexp=None)
|
||||
|
||||
filename = None
|
||||
|
@ -113,8 +156,9 @@ def main(argv):
|
|||
date = None
|
||||
|
||||
try:
|
||||
opts, args = getopt.getopt(argv, "hbcd:snvr:",
|
||||
["--regesp", "--date", "--beautify"])
|
||||
opts, args = getopt.getopt(argv, "hbcd:sna:vr:", [
|
||||
"--regesp", "--date", "--archive", "--beautify"
|
||||
])
|
||||
except getopt.GetoptError:
|
||||
helpmsg()
|
||||
sys.exit(2)
|
||||
|
@ -122,6 +166,8 @@ def main(argv):
|
|||
if opt == '-h':
|
||||
helpmsg()
|
||||
sys.exit()
|
||||
elif opt in ("-a", "--archive"):
|
||||
config = config._replace(basedir=arg)
|
||||
elif opt == '-n':
|
||||
config = config._replace(comment=False)
|
||||
elif opt == '-s':
|
||||
|
@ -145,25 +191,17 @@ def main(argv):
|
|||
helpmsg()
|
||||
sys.exit()
|
||||
|
||||
if filename.endswith('.crx') and path is not None:
|
||||
with ZipFile(filename) as crxobj:
|
||||
with crxobj.open(path) as js_file:
|
||||
data = js_file.read()
|
||||
jsstrings_data(path, data, config)
|
||||
elif filename.endswith('.crx') and path is None:
|
||||
with ZipFile(filename) as crxobj:
|
||||
js_files = list(
|
||||
filter(lambda x: x.filename.endswith(".js"),
|
||||
crxobj.infolist()))
|
||||
for jsfile in js_files:
|
||||
with crxobj.open(jsfile) as js_file_obj:
|
||||
data = js_file_obj.read()
|
||||
path = js_file_obj.name
|
||||
jsstrings_data(path, data, config)
|
||||
elif filename.endswith('.tar') and path is not None:
|
||||
pass
|
||||
elif filename.endswith('.tar') and path is None:
|
||||
pass
|
||||
extid_re = re.compile('^[a-p]+$')
|
||||
|
||||
if filename.endswith('.crx'):
|
||||
analyze_crx(config, filename, path)
|
||||
elif filename.endswith('.tar'):
|
||||
analyze_tar(config, date, filename, path)
|
||||
elif extid_re.match(filename):
|
||||
extid = filename
|
||||
filename = os.path.join(config.basedir, 'data',
|
||||
get_local_archive_dir(extid), extid + ".tar")
|
||||
analyze_tar(config, date, filename, path)
|
||||
else:
|
||||
with open(filename, 'rb') as fileobj:
|
||||
data = fileobj.read()
|
||||
|
|
Loading…
Reference in New Issue