Improved extgrep.
This commit is contained in:
parent
6afc1429ed
commit
9d615760f1
104
extgrep
104
extgrep
|
@ -24,20 +24,36 @@ import re
|
|||
import json
|
||||
import sys
|
||||
import importlib.util
|
||||
import csv
|
||||
import math
|
||||
|
||||
from zipfile import ZipFile
|
||||
|
||||
from ExtensionCrawler.config import (const_log_format, const_basedir)
|
||||
from ExtensionCrawler.archive import iter_tar_entries, iter_tar_entries_by_date
|
||||
from ExtensionCrawler.archive import iter_tar_entries_by_date
|
||||
from ExtensionCrawler.js_mincer import mince_js
|
||||
|
||||
|
||||
# Filename suffixes that is_source_file() accepts.
_SOURCE_FILE_SUFFIXES = (
    ".js", ".js.gz", ".jgz", ".jsg",
    ".css.gz", ".c", ".cpp", ".java",
)


def is_source_file(zipentry):
    """Return True if the zip entry's filename ends with a known source suffix.

    Only the filename suffix is inspected; the entry's contents are never
    read. The match is case-sensitive.

    Args:
        zipentry: an object with a ``filename`` string attribute (for example
            a ``zipfile.ZipInfo``).

    Returns:
        bool: whether ``zipentry.filename`` ends with one of
        ``_SOURCE_FILE_SUFFIXES``.
    """
    # str.endswith accepts a tuple, so one call replaces the chain of `or`s.
    return zipentry.filename.endswith(_SOURCE_FILE_SUFFIXES)
|
||||
def get_shannon_entropy(
        string,
        chars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"):
    """Compute the Shannon entropy (in bits) of ``string`` over ``chars``.

    This code has been borrowed from
    "http://blog.dkbza.org/2007/05/scanning-data-for-entropy-anomalies.html" and
    "git@github.com:dxa4481/truffleHog.git"

    Only characters listed in ``chars`` contribute a term to the sum, but the
    relative frequency of each is taken over the *full* length of ``string``.
    Characters outside ``chars`` therefore lower the result instead of being
    ignored.

    Args:
        string: the text to measure.
        chars: the alphabet whose characters are counted. Defaults to the
            ASCII letters and digits. Each character should appear only once,
            since a repeated character would be counted repeatedly.

    Returns:
        ``0`` for an empty ``string``; otherwise the sum of
        ``-p * log2(p)`` over every character of ``chars`` that occurs in
        ``string``, as a float.
    """
    if not string:
        return 0

    length = len(string)
    entropy = 0
    for x in chars:
        # True division already yields a float in Python 3.
        p_x = string.count(x) / length
        if p_x > 0:
            entropy -= p_x * math.log(p_x, 2)
    return entropy
|
||||
|
||||
|
||||
def is_likely_hash(string):
    """Heuristically decide whether ``string`` looks like a hash or key.

    The string qualifies when its Shannon entropy (as computed by
    ``get_shannon_entropy``) is above 2.0 and it contains more than four
    digit characters.
    """
    if not get_shannon_entropy(string) > 2.0:
        return False
    digit_count = sum(1 for ch in string if ch.isdigit())
    return digit_count > 4
|
||||
|
||||
|
||||
def import_regexs(path):
|
||||
spec = importlib.util.spec_from_file_location("MinerStrings", path)
|
||||
|
@ -45,6 +61,7 @@ def import_regexs(path):
|
|||
spec.loader.exec_module(module)
|
||||
return module
|
||||
|
||||
|
||||
def get_etag(headers_content):
|
||||
headers_content = headers_content.replace(
|
||||
'"', '\\"').replace("'", '"')
|
||||
|
@ -52,6 +69,7 @@ def get_etag(headers_content):
|
|||
if "ETag" in headers_json:
|
||||
return headers_json["ETag"]
|
||||
|
||||
|
||||
def get_name_and_version(overview_contents):
|
||||
# Extract extension name
|
||||
match = re.search("""<meta itemprop="name" content="(.*?)"\s*/>""",
|
||||
|
@ -65,7 +83,8 @@ def get_name_and_version(overview_contents):
|
|||
|
||||
return name, version
|
||||
|
||||
def handle_extid(conf, extid):
|
||||
|
||||
def handle_extid(conf, extid, csvwriter):
|
||||
miner_strings = import_regexs(conf.REGEXP_FILE).MinerStrings()
|
||||
|
||||
results = []
|
||||
|
@ -99,7 +118,7 @@ def handle_extid(conf, extid):
|
|||
if tarentry_filename.endswith(".crx"):
|
||||
with ZipFile(tarfile) as zf:
|
||||
for zipentry in zf.infolist():
|
||||
if is_source_file(zipentry):
|
||||
if zipentry.filename.endswith(".js"):
|
||||
with zf.open(zipentry) as f:
|
||||
for block in mince_js(io.TextIOWrapper(f, encoding="utf-8", errors="surrogateescape")):
|
||||
file_lines = []
|
||||
|
@ -118,13 +137,15 @@ def handle_extid(conf, extid):
|
|||
for line in file_lines:
|
||||
m = re.search(search_pattern, line)
|
||||
if m:
|
||||
matches += [[zipentry.filename, search_tag, m.group()]]
|
||||
break
|
||||
matched_string = m.group()
|
||||
# Compare the tag by value: `is not` against a string literal tests object
# identity, which depends on interning and is not a reliable equality check.
# Matches tagged MINING_KEYS_REGEX are kept only when they look like a hash.
if search_tag != "MINING_KEYS_REGEX" or is_likely_hash(matched_string):
    matches += [[zipentry.filename, search_tag, matched_string]]
|
||||
break
|
||||
for match in matches:
|
||||
results += [[date, crx_etag, name, version] + match]
|
||||
|
||||
for result in results:
|
||||
print("|".join([str(x) for x in ([extid, still_in_store, crx_etags[-1]] + result)]))
|
||||
csvwriter.writerow([str(x) for x in ([extid, still_in_store, crx_etags[-1]] + result)])
|
||||
|
||||
|
||||
def main(conf):
|
||||
|
@ -138,13 +159,13 @@ def main(conf):
|
|||
logger.setLevel(logging.WARNING)
|
||||
|
||||
with open(conf.EXTID_FILE) as f:
|
||||
print("|".join(["extid", "still_in_store", "most_recent_crx_etag", "date", "crx_etag", "name", "version", "path", "tag", "match"]))
|
||||
csvwriter = csv.writer(sys.stdout, csv.unix_dialect)
|
||||
csvwriter.writerow(["extid", "still_in_store", "most_recent_crx_etag", "date", "crx_etag", "name", "version", "path", "tag", "match"])
|
||||
for extid in [l.strip() for l in f.readlines()]:
|
||||
handle_extid(conf, extid)
|
||||
handle_extid(conf, extid, csvwriter)
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
def build_parser():
|
||||
main_parser = argparse.ArgumentParser(
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
description='Grep for extensions.')
|
||||
|
@ -168,8 +189,8 @@ if __name__ == "__main__":
|
|||
metavar='DATE',
|
||||
type=str,
|
||||
help='select latest crx from tar, released before DATE.\n' +
|
||||
'Together with --from-date, specifies all crx released in specified\n'
|
||||
+ 'date range.')
|
||||
'Together with --from-date, specifies all crx released in specified\n' +
|
||||
'date range.')
|
||||
|
||||
main_parser.add_argument(
|
||||
'-d',
|
||||
|
@ -177,8 +198,8 @@ if __name__ == "__main__":
|
|||
metavar='DATE',
|
||||
type=str,
|
||||
help='select oldest crx from tar released after DATE.\n' +
|
||||
'Together with --latest-date, specifies all crx released in specified\n'
|
||||
+ 'date range.')
|
||||
'Together with --latest-date, specifies all crx released in specified\n' +
|
||||
'date range.')
|
||||
|
||||
main_parser.add_argument(
|
||||
'-a',
|
||||
|
@ -188,47 +209,12 @@ if __name__ == "__main__":
|
|||
default=const_basedir(),
|
||||
help='archive directory')
|
||||
|
||||
# comment_group = main_parser.add_argument_group('comment blocks')
|
||||
# comment_group.add_argument(
|
||||
# '-g',
|
||||
# '--group-single-line-comments',
|
||||
# help='Group consecutive singe-line comments into blocks')
|
||||
# comment_group.add_argument(
|
||||
# '-c',
|
||||
# '--reg-exp-comments',
|
||||
# metavar='REGEXP',
|
||||
# type=str,
|
||||
# nargs='+',
|
||||
# help='search comments for regular expression')
|
||||
return main_parser
|
||||
|
||||
# source_group = main_parser.add_argument_group('source blocks')
|
||||
# source_group.add_argument(
|
||||
# '-b',
|
||||
# '--beautify',
|
||||
# action='store_true',
|
||||
# default=False,
|
||||
# help='beautify source code')
|
||||
# source_group.add_argument(
|
||||
# '-s',
|
||||
# '--reg-exp-source',
|
||||
# metavar='REGEXP',
|
||||
# type=str,
|
||||
# nargs='+',
|
||||
# help='search source for regular expression')
|
||||
|
||||
# strings_group = main_parser.add_argument_group('string literals')
|
||||
# strings_group.add_argument(
|
||||
# '-j',
|
||||
# '--join-string-literals',
|
||||
# action='store_true',
|
||||
# help='join string literals (heuristic)')
|
||||
# strings_group.add_argument(
|
||||
# '-l',
|
||||
# '--reg-exp-string-literals',
|
||||
# metavar='REGEXP',
|
||||
# type=str,
|
||||
# nargs='+',
|
||||
# help='search string literals for regular expression')
|
||||
if __name__ == "__main__":
|
||||
main_parser = build_parser()
|
||||
|
||||
main_conf = main_parser.parse_args()
|
||||
|
||||
sys.exit(main(main_conf))
|
||||
|
|
Loading…
Reference in New Issue