From 6afc1429ed47160891282fade51ea65ecbd59489 Mon Sep 17 00:00:00 2001 From: Michael Herzberg Date: Thu, 14 Feb 2019 11:50:49 +0000 Subject: [PATCH] Small changes to extgrep. --- ExtensionCrawler/archive.py | 7 ++- extgrep | 106 ++++++++++++++++-------------------- 2 files changed, 54 insertions(+), 59 deletions(-) diff --git a/ExtensionCrawler/archive.py b/ExtensionCrawler/archive.py index 52522ab..165dec2 100644 --- a/ExtensionCrawler/archive.py +++ b/ExtensionCrawler/archive.py @@ -36,6 +36,7 @@ import datetime import dateutil import dateutil.parser import requests +from itertools import groupby from ExtensionCrawler.config import ( const_review_payload, const_review_search_url, const_download_url, @@ -638,7 +639,8 @@ def iter_tar_entries_from_file_ext(archivedir, extid, ext): tar = os.path.join(archivedir, get_local_archive_dir(extid), extid + ext) with tarfile.open(tar, 'r') as tf: for tarentry in tf: - yield (tarentry, tf.extractfile(tarentry)) + if tarentry.isfile(): + yield (tarentry, tf.extractfile(tarentry)) def iter_tar_entries(archivedir, extid): for i in range(1000): @@ -651,3 +653,6 @@ def iter_tar_entries(archivedir, extid): ext = ".tar" for (tarentry, tarfile) in iter_tar_entries_from_file_ext(archivedir, extid, ext): yield (tarentry, tarfile) + +def iter_tar_entries_by_date(archivedir, extid): + return groupby(iter_tar_entries(archivedir, extid), lambda tup: tup[0].name.split("/")[1]) diff --git a/extgrep b/extgrep index c486af2..9f4839e 100755 --- a/extgrep +++ b/extgrep @@ -28,7 +28,7 @@ import importlib.util from zipfile import ZipFile from ExtensionCrawler.config import (const_log_format, const_basedir) -from ExtensionCrawler.archive import iter_tar_entries +from ExtensionCrawler.archive import iter_tar_entries, iter_tar_entries_by_date from ExtensionCrawler.js_mincer import mince_js @@ -45,24 +45,22 @@ def import_regexs(path): spec.loader.exec_module(module) return module -def get_etag(header_tarentry): - headers_content = header_tarentry.read().decode().replace( +def get_etag(headers_content): + headers_content = headers_content.replace( '"', '\\"').replace("'", '"') headers_json = json.loads(headers_content) if "ETag" in headers_json: return headers_json["ETag"] -def get_name_and_version(overview_tarentry): - contents = overview_tarentry.read().decode() - +def get_name_and_version(overview_contents): # Extract extension name match = re.search("""""", - contents) + overview_contents) name = match.group(1) if match else None # Extract extension version match = re.search( - """""", contents) + """""", overview_contents) version = match.group(1) if match else None return name, version @@ -70,71 +68,63 @@ def get_name_and_version(overview_tarentry): def handle_extid(conf, extid): miner_strings = import_regexs(conf.REGEXP_FILE).MinerStrings() - results = {} + results = [] still_in_store = None crx_etags = [None] - for tarentry, tarfile in iter_tar_entries(conf.archive_dir, extid): - if tarentry.isdir(): - continue - date = tarentry.name.split("/")[1] + for date, tups in iter_tar_entries_by_date(conf.archive_dir, extid): if conf.from_date and not (conf.from_date <= date): continue if conf.latest_date and not (date <= conf.latest_date): continue - if date not in results: - results[date] = {} - results[date]["crx_etag"] = None - results[date]["name"] = None - results[date]["version"] = None - results[date]["matches"] = [] + crx_etag = None + name = None + version = None + matches = [] + for tarentry, tarfile in tups: + tarentry_filename = tarentry.name.split("/")[-1] - tar_file_name = tarentry.name.split("/")[-1] + if tarentry_filename.endswith(".crx.headers"): + crx_etag = get_etag(tarfile.read().decode()) + if crx_etag: + crx_etags += [crx_etag] - if tar_file_name.endswith(".crx.headers"): - crx_etag = get_etag(tarfile) - results[date]["crx_etag"] = crx_etag - if crx_etag: - crx_etags += [crx_etag] + if tarentry_filename == "overview.html": + name, version = get_name_and_version(tarfile.read().decode()) - if tar_file_name == "overview.html": - results[date]["name"], results[date]["version"] = get_name_and_version(tarfile) + if tarentry_filename == "overview.html.status": + still_in_store = tarfile.read().decode().startswith("2") - if tar_file_name == "overview.html.status": - still_in_store = tarfile.read().decode().startswith("2") + if tarentry_filename.endswith(".crx"): + with ZipFile(tarfile) as zf: + for zipentry in zf.infolist(): + if is_source_file(zipentry): + with zf.open(zipentry) as f: + for block in mince_js(io.TextIOWrapper(f, encoding="utf-8", errors="surrogateescape")): + file_lines = [] + file_lines += block.content.splitlines() + file_lines += "".join(map(lambda x: x[1], block.string_literals)).splitlines() - if tar_file_name.endswith(".crx"): - with ZipFile(tarfile) as zf: - for zipentry in zf.infolist(): - if is_source_file(zipentry): - with zf.open(zipentry) as f: - for block in mince_js(io.TextIOWrapper(f, encoding="utf-8", errors="surrogateescape")): + for search_tag in miner_strings.strings.keys(): + for search_string in miner_strings.strings[search_tag]: + for line in file_lines: + if search_string in line: + matches += [[zipentry.filename, search_tag, search_string]] + break - file_lines = [] - file_lines += block.content.splitlines() - file_lines += "".join(map(lambda x: x[1], block.string_literals)).splitlines() + for search_tag in miner_strings.patterns.keys(): + for search_pattern in miner_strings.patterns[search_tag]: + for line in file_lines: + m = re.search(search_pattern, line) + if m: + matches += [[zipentry.filename, search_tag, m.group()]] + break + for match in matches: + results += [[date, crx_etag, name, version] + match] - for search_tag in miner_strings.strings.keys(): - for search_string in miner_strings.strings[search_tag]: - for line in file_lines: - if search_string in line: - results[date]["matches"] += [[zipentry.filename, search_tag, search_string]] - break - - for search_tag in miner_strings.patterns.keys(): - for search_pattern in miner_strings.patterns[search_tag]: - for line in file_lines: - m = re.search(search_pattern, line) - if m: - results[date]["matches"] += [[zipentry.filename, search_tag, m.group()]] - break - #for extid, still_in_store, most_recent_crx_etag, date, crx_etag, name, version, path, tag, match - - for date in sorted(results.keys()): - result = results[date] - for match in result["matches"]: - print("|".join([str(x) for x in ([extid, still_in_store, crx_etags[-1], date, result["crx_etag"], result["name"], result["version"]] + match)])) + for result in results: + print("|".join([str(x) for x in ([extid, still_in_store, crx_etags[-1]] + result)])) def main(conf):