commit 6afc1429ed
parent f2dd7e2642

Small changes to extgrep.
ExtensionCrawler/archive.py
@@ -36,6 +36,7 @@ import datetime
 import dateutil
 import dateutil.parser
 import requests
+from itertools import groupby
 
 from ExtensionCrawler.config import (
     const_review_payload, const_review_search_url, const_download_url,
@@ -638,7 +639,8 @@ def iter_tar_entries_from_file_ext(archivedir, extid, ext):
     tar = os.path.join(archivedir, get_local_archive_dir(extid), extid + ext)
     with tarfile.open(tar, 'r') as tf:
         for tarentry in tf:
-            yield (tarentry, tf.extractfile(tarentry))
+            if tarentry.isfile():
+                yield (tarentry, tf.extractfile(tarentry))
 
 def iter_tar_entries(archivedir, extid):
     for i in range(1000):
@@ -651,3 +653,6 @@ def iter_tar_entries(archivedir, extid):
             ext = ".tar"
         for (tarentry, tarfile) in iter_tar_entries_from_file_ext(archivedir, extid, ext):
             yield (tarentry, tarfile)
+
+def iter_tar_entries_by_date(archivedir, extid):
+    return groupby(iter_tar_entries(archivedir, extid), lambda tup: tup[0].name.split("/")[1])
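Note: the new iter_tar_entries_by_date leans on itertools.groupby, which only merges *consecutive* items with equal keys, so it implicitly assumes the tar stores an extension's files grouped by crawl date (the second "/"-separated path component, as the old code's tarentry.name.split("/")[1] also assumed). A minimal standalone sketch of that behaviour, using made-up entry paths:

# Standalone sketch with made-up entry paths; the key function mirrors the
# lambda above (second path component is the crawl date).
from itertools import groupby

entry_names = [
    "extid/2018-06-01/extid.crx.headers",
    "extid/2018-06-01/overview.html",
    "extid/2018-06-02/overview.html",
]

for date, entries in groupby(entry_names, key=lambda name: name.split("/")[1]):
    print(date, "->", list(entries))
# 2018-06-01 -> ['extid/2018-06-01/extid.crx.headers', 'extid/2018-06-01/overview.html']
# 2018-06-02 -> ['extid/2018-06-02/overview.html']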
extgrep (106 changed lines)
@@ -28,7 +28,7 @@ import importlib.util
 from zipfile import ZipFile
 
 from ExtensionCrawler.config import (const_log_format, const_basedir)
-from ExtensionCrawler.archive import iter_tar_entries
+from ExtensionCrawler.archive import iter_tar_entries, iter_tar_entries_by_date
 from ExtensionCrawler.js_mincer import mince_js
 
 
@@ -45,24 +45,22 @@ def import_regexs(path):
     spec.loader.exec_module(module)
     return module
 
-def get_etag(header_tarentry):
-    headers_content = header_tarentry.read().decode().replace(
+def get_etag(headers_content):
+    headers_content = headers_content.replace(
         '"', '\\"').replace("'", '"')
     headers_json = json.loads(headers_content)
    if "ETag" in headers_json:
         return headers_json["ETag"]
 
-def get_name_and_version(overview_tarentry):
-    contents = overview_tarentry.read().decode()
-
+def get_name_and_version(overview_contents):
     # Extract extension name
     match = re.search("""<meta itemprop="name" content="(.*?)"\s*/>""",
-                      contents)
+                      overview_contents)
     name = match.group(1) if match else None
 
     # Extract extension version
     match = re.search(
-        """<meta itemprop="version" content="(.*?)"\s*/>""", contents)
+        """<meta itemprop="version" content="(.*?)"\s*/>""", overview_contents)
     version = match.group(1) if match else None
 
     return name, version
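Note: after this hunk, get_etag and get_name_and_version take already-decoded member contents instead of tar entry file objects, so the caller reads and decodes each member once (see the handle_extid hunk below). A rough usage sketch with hypothetical inputs; the stored headers apparently use Python-repr-style quoting, which get_etag rewrites into JSON before parsing:

# Hypothetical inputs for illustration only.
headers_content = """{'ETag': '"abc123"'}"""
print(get_etag(headers_content))            # -> "abc123" (quotes included)

overview_contents = ('<meta itemprop="name" content="Some Extension"/>'
                     '<meta itemprop="version" content="1.2.3"/>')
print(get_name_and_version(overview_contents))  # -> ('Some Extension', '1.2.3')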
@@ -70,71 +68,63 @@ def get_name_and_version(overview_tarentry):
 def handle_extid(conf, extid):
     miner_strings = import_regexs(conf.REGEXP_FILE).MinerStrings()
 
-    results = {}
+    results = []
 
     still_in_store = None
     crx_etags = [None]
-    for tarentry, tarfile in iter_tar_entries(conf.archive_dir, extid):
-        if tarentry.isdir():
-            continue
-        date = tarentry.name.split("/")[1]
+    for date, tups in iter_tar_entries_by_date(conf.archive_dir, extid):
         if conf.from_date and not (conf.from_date <= date):
             continue
         if conf.latest_date and not (date <= conf.latest_date):
             continue
 
-        if date not in results:
-            results[date] = {}
-            results[date]["crx_etag"] = None
-            results[date]["name"] = None
-            results[date]["version"] = None
-            results[date]["matches"] = []
-
-        tar_file_name = tarentry.name.split("/")[-1]
-
-        if tar_file_name.endswith(".crx.headers"):
-            crx_etag = get_etag(tarfile)
-            results[date]["crx_etag"] = crx_etag
-            if crx_etag:
-                crx_etags += [crx_etag]
-
-        if tar_file_name == "overview.html":
-            results[date]["name"], results[date]["version"] = get_name_and_version(tarfile)
-
-        if tar_file_name == "overview.html.status":
-            still_in_store = tarfile.read().decode().startswith("2")
-
-        if tar_file_name.endswith(".crx"):
-            with ZipFile(tarfile) as zf:
-                for zipentry in zf.infolist():
-                    if is_source_file(zipentry):
-                        with zf.open(zipentry) as f:
-                            for block in mince_js(io.TextIOWrapper(f, encoding="utf-8", errors="surrogateescape")):
-                                file_lines = []
-                                file_lines += block.content.splitlines()
-                                file_lines += "".join(map(lambda x: x[1], block.string_literals)).splitlines()
-
-                                for search_tag in miner_strings.strings.keys():
-                                    for search_string in miner_strings.strings[search_tag]:
-                                        for line in file_lines:
-                                            if search_string in line:
-                                                results[date]["matches"] += [[zipentry.filename, search_tag, search_string]]
-                                                break
-
-                                for search_tag in miner_strings.patterns.keys():
-                                    for search_pattern in miner_strings.patterns[search_tag]:
-                                        for line in file_lines:
-                                            m = re.search(search_pattern, line)
-                                            if m:
-                                                results[date]["matches"] += [[zipentry.filename, search_tag, m.group()]]
-                                                break
-
-    #for extid, still_in_store, most_recent_crx_etag, date, crx_etag, name, version, path, tag, match
-
-    for date in sorted(results.keys()):
-        result = results[date]
-        for match in result["matches"]:
-            print("|".join([str(x) for x in ([extid, still_in_store, crx_etags[-1], date, result["crx_etag"], result["name"], result["version"]] + match)]))
+        crx_etag = None
+        name = None
+        version = None
+        matches = []
+        for tarentry, tarfile in tups:
+            tarentry_filename = tarentry.name.split("/")[-1]
+
+            if tarentry_filename.endswith(".crx.headers"):
+                crx_etag = get_etag(tarfile.read().decode())
+                if crx_etag:
+                    crx_etags += [crx_etag]
+
+            if tarentry_filename == "overview.html":
+                name, version = get_name_and_version(tarfile.read().decode())
+
+            if tarentry_filename == "overview.html.status":
+                still_in_store = tarfile.read().decode().startswith("2")
+
+            if tarentry_filename.endswith(".crx"):
+                with ZipFile(tarfile) as zf:
+                    for zipentry in zf.infolist():
+                        if is_source_file(zipentry):
+                            with zf.open(zipentry) as f:
+                                for block in mince_js(io.TextIOWrapper(f, encoding="utf-8", errors="surrogateescape")):
+                                    file_lines = []
+                                    file_lines += block.content.splitlines()
+                                    file_lines += "".join(map(lambda x: x[1], block.string_literals)).splitlines()
+
+                                    for search_tag in miner_strings.strings.keys():
+                                        for search_string in miner_strings.strings[search_tag]:
+                                            for line in file_lines:
+                                                if search_string in line:
+                                                    matches += [[zipentry.filename, search_tag, search_string]]
+                                                    break
+
+                                    for search_tag in miner_strings.patterns.keys():
+                                        for search_pattern in miner_strings.patterns[search_tag]:
+                                            for line in file_lines:
+                                                m = re.search(search_pattern, line)
+                                                if m:
+                                                    matches += [[zipentry.filename, search_tag, m.group()]]
+                                                    break
+
+        for match in matches:
+            results += [[date, crx_etag, name, version] + match]
+
+    for result in results:
+        print("|".join([str(x) for x in ([extid, still_in_store, crx_etags[-1]] + result)]))
 
 
 def main(conf):
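Note: handle_extid now emits one pipe-joined row per match, in the column order visible in the print() call (and in the comment the hunk removes): extid, still_in_store, most_recent_crx_etag, date, crx_etag, name, version, path, tag, match. A hypothetical post-processing sketch, assuming no field contains a literal "|":

# Hypothetical consumer of extgrep's output; column order is taken from the
# print() call above.
import sys

COLUMNS = ["extid", "still_in_store", "most_recent_crx_etag", "date",
           "crx_etag", "name", "version", "path", "tag", "match"]

for line in sys.stdin:
    record = dict(zip(COLUMNS, line.rstrip("\n").split("|")))
    print(record["date"], record["tag"], record["match"])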