#!/usr/bin/env python3
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
"""Grep through the beautified JavaScript of archived Chrome extensions and
write one pipe-separated *.grep report per extension."""

import getopt
import os
import sys
import glob
import tarfile
import tempfile
import traceback
import re
from functools import partial
from multiprocessing import Pool
from zipfile import ZipFile

import jsbeautifier

from ExtensionCrawler.config import *
from ExtensionCrawler.util import *


def get_name(overview_path):
    if os.path.exists(overview_path):
        with open(overview_path) as overview_file:
            contents = overview_file.read()

            # Extract the extension name from a meta tag in overview.html
            match = re.search(r"""<meta itemprop="name" content="(.*?)"/>""",
                              contents)
            if match:
                return match.group(1)


def get_downloads(overview_path):
    if os.path.exists(overview_path):
        with open(overview_path) as overview_file:
            contents = overview_file.read()

            # Extract the download count from a meta tag in overview.html
            match = re.search(
                r"""<meta itemprop="interactionCount" content="UserDownloads:([\d,]+)"/>""",
                contents)
            if match:
                return int(match.group(1).replace(",", ""))


def help():
    print("grepper [OPTION] OUTDIR GREP [GREP]*")
    print("    OUTDIR               directory for the *.grep output files")
    print("    GREP                 regular expression to search for")
    print("    -h                   print this help text")
    print("    -a=<DIR>             archive directory")
    print("    -p=<PREFIX>          three-letter-prefix")
    print("    -t=<THREADS>         number of parallel threads")


def process_date(extdir, ext_id, date, greps, out_f):
    overview_path = os.path.join(extdir, "overview.html")

    crxpath = next(iter(glob.glob(os.path.join(extdir, "*.crx"))), None)
    if crxpath and os.path.getsize(crxpath) > 0:
        # Beautify all JavaScript files contained in the *.crx package
        contents = {}
        with ZipFile(crxpath) as f:
            for in_zip_f in f.infolist():
                if not in_zip_f.filename.endswith(".js"):
                    continue
                with f.open(in_zip_f) as f2:
                    content = f2.read().decode(errors="surrogateescape")
                    contents[in_zip_f.filename] = jsbeautifier.beautify(
                        content)
        jslocs = sum(
            [len(content.splitlines()) for content in contents.values()])
        name = get_name(overview_path)
        downloads = get_downloads(overview_path)

        # Emit one pipe-separated record per matching line
        for filename, content in contents.items():
            for i, line in enumerate(content.splitlines()):
                for gr in greps:
                    if re.search(gr, line):
                        args = [
                            ext_id,
                            date,
                            name.replace("|", "") if name is not None else "",
                            str(downloads) if downloads is not None else "",
                            str(jslocs),
                            gr,
                            filename + " (line " + str(i + 1) + ")",
                            line,
                        ]
                        print("|".join(args), file=out_f)


def process_id(archivedir, outdir, greps, verbose, ext_id):
    txt = ""
    txt = logmsg(verbose, txt, "Processing {} ...\n".format(ext_id))

    tarpath = archive_file(archivedir, ext_id)
    greppath = os.path.join(outdir, ext_id + ".grep")
    if os.path.exists(greppath):
        os.remove(greppath)
    with open(greppath, "w") as out_f:
        with tempfile.TemporaryDirectory() as tmpdir:
            with tarfile.open(tarpath) as t:
                t.extractall(tmpdir)
            iddir = os.path.join(tmpdir, ext_id)
            for date in sorted(os.listdir(iddir)):
                try:
                    process_date(
                        os.path.join(iddir, date), ext_id, date, greps, out_f)
                except Exception:
                    txt = logmsg(
                        verbose, txt,
                        "Exception when handling {} on {}:\n".format(
                            ext_id, date))
                    txt = logmsg(verbose, txt, traceback.format_exc())

    return txt


def main(argv):
    archive = "archive"
    prefix = ""
    parallel = 8
    try:
        opts, args = getopt.getopt(argv, "ha:p:t:",
                                   ["archive=", "prefix=", "threads="])
    except getopt.GetoptError:
        help()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            help()
            sys.exit()
        elif opt in ("-a", "--archive"):
            archive = arg
        elif opt in ("-p", "--prefix"):
            prefix = arg
        elif opt in ("-t", "--threads"):
            parallel = int(arg)

    if len(args) < 2:
        help()
        sys.exit(2)

    outdir = args[0]
    greps = args[1:]

    archivedir = os.path.join(archive, "data")
    threeletterdirs = glob.glob(os.path.join(archivedir, prefix + "*"))
    for threeletterdir in threeletterdirs:
        ext_ids = list(set([d[:32] for d in os.listdir(threeletterdir)]))
        with Pool(parallel) as p:
            for txt in p.imap(
                    partial(process_id, archivedir, outdir, greps, True),
                    ext_ids):
                sys.stdout.write(txt)
                sys.stdout.flush()


if __name__ == "__main__":
    main(sys.argv[1:])