173 lines
5.8 KiB
Python
Executable File
173 lines
5.8 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
#
|
|
# Copyright (C) 2016,2017 The University of Sheffield, UK
|
|
#
|
|
# This program is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
#
|
|
|
|
import getopt
|
|
import os
|
|
import sys
|
|
import glob
|
|
import tarfile
|
|
import tempfile
|
|
import traceback
|
|
import jsbeautifier
|
|
from multiprocessing import Pool
|
|
from zipfile import ZipFile
|
|
from functools import partial
|
|
import re
|
|
|
|
from ExtensionCrawler.config import *
|
|
from ExtensionCrawler.util import *
|
|
|
|
|
|
def get_name(overview_path):
    """Return the extension name recorded in a stored overview page.

    Args:
        overview_path: Path to an ``overview.html`` file.

    Returns:
        The ``content`` value of the first ``<meta itemprop="name" ... />``
        tag found in the file, or ``None`` if the file does not exist or
        contains no such tag.
    """
    if not os.path.exists(overview_path):
        return None

    with open(overview_path) as overview_file:
        contents = overview_file.read()

    # Raw string: "\s" inside a plain string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    match = re.search(
        r"""<meta itemprop="name" content="(.*?)"\s*/>""", contents)
    if match is None:
        return None
    return match.group(1)
|
|
|
|
|
|
def get_downloads(overview_path):
    """Return the download count recorded in a stored overview page.

    Args:
        overview_path: Path to an ``overview.html`` file.

    Returns:
        The number following ``UserDownloads:`` in the first
        ``<meta itemprop="interactionCount" ...>`` tag, as an ``int`` with
        thousands separators removed. ``None`` is returned if the file does
        not exist, the tag is absent, or the value contains no digits.
    """
    if not os.path.exists(overview_path):
        return None

    with open(overview_path) as overview_file:
        contents = overview_file.read()

    # Only digits and thousands separators belong to the count. The previous
    # group "(:?\d|,)" also swallowed ':' characters, which made int() fail.
    match = re.search(
        r"""<meta itemprop="interactionCount" content="UserDownloads:([\d,]+)""",
        contents)
    if match is None:
        return None

    digits = match.group(1).replace(",", "")
    # A value made only of commas carries no number at all.
    return int(digits) if digits else None
|
|
|
|
|
|
def help():
    """Print the command-line usage summary to standard output."""
    usage_lines = (
        "grepper [OPTION] BASEDIR GREP1 [GREP2...]",
        " GREP1 [GREP2...] regex patterns",
        " BASEDIR directory for output",
        " -h print this help text",
        " -a=<DIR> archive directory",
        " -p=<PREFIX> three-letter-prefix",
        " -t=<THREADS> number of parallel threads",
    )
    # One print of the joined text emits the same bytes as one print per line.
    print("\n".join(usage_lines))
|
|
|
|
|
|
def process_date(extdir, ext_id, date, greps, out_f):
    """Grep the JavaScript files of one extension snapshot.

    Takes the first ``*.crx`` file that ``glob`` reports in ``extdir``. If
    there is none, or it is empty, nothing is written. Otherwise every archive
    member whose name ends in ``.js`` is read, passed through
    ``jsbeautifier.beautify`` and searched line by line with each pattern in
    ``greps``. Every hit is written to ``out_f`` as one ``|``-separated record::

        ext_id|date|name|downloads|jslocs|pattern|file (line N)|line

    ``name`` and ``downloads`` come from ``overview.html`` in ``extdir`` and
    are left empty when unavailable; ``jslocs`` is the total number of
    beautified JavaScript lines in the archive.

    Args:
        extdir: Directory holding the snapshot files.
        ext_id: Extension identifier, copied into every record.
        date: Snapshot label, copied into every record.
        greps: Regular-expression strings to search for.
        out_f: Writable text file object that receives the records.
    """
    crxpath = next(iter(glob.glob(os.path.join(extdir, "*.crx"))), None)
    if not crxpath or os.path.getsize(crxpath) <= 0:
        return

    # Keep the beautified source already split into lines so that the line
    # count and the search below share one splitlines() call per file.
    lines_by_file = {}
    with ZipFile(crxpath) as crx:
        for member in crx.infolist():
            if not member.filename.endswith(".js"):
                continue
            with crx.open(member) as js_file:
                # surrogateescape keeps undecodable bytes instead of failing.
                source = js_file.read().decode(errors="surrogateescape")
            lines_by_file[member.filename] = jsbeautifier.beautify(
                source).splitlines()

    jslocs = str(sum(len(lines) for lines in lines_by_file.values()))

    overview_path = os.path.join(extdir, "overview.html")
    name = get_name(overview_path)
    downloads = get_downloads(overview_path)
    # '|' is the field separator, so it must not appear inside the name field.
    name_field = name.replace("|", "<PIPE>") if name is not None else ""
    downloads_field = str(downloads) if downloads is not None else ""

    # Compile each pattern once instead of resolving it again for every line.
    patterns = [(gr, re.compile(gr)) for gr in greps]

    for filename, lines in lines_by_file.items():
        for lineno, line in enumerate(lines, start=1):
            for gr, pattern in patterns:
                if pattern.search(line):
                    record = [
                        ext_id, date,
                        name_field,
                        downloads_field,
                        jslocs, gr,
                        "{} (line {})".format(filename, lineno),
                        line,
                    ]
                    print("|".join(record), file=out_f)
|
|
|
|
|
|
def process_id(archivedir, outdir, greps, verbose, ext_id):
    """Grep every archived snapshot of one extension into ``<ext_id>.grep``.

    The tar archive located by ``archive_file(archivedir, ext_id)`` is
    unpacked into a temporary directory, and each entry of its ``ext_id``
    sub-directory is handed to ``process_date`` in sorted order. An exception
    raised for one snapshot is logged and does not stop the remaining ones.

    Args:
        archivedir: Directory passed on to ``archive_file``.
        outdir: Existing directory in which ``<ext_id>.grep`` is (re)created.
        greps: Regular-expression strings passed on to ``process_date``.
        verbose: Flag passed on to ``logmsg``.
        ext_id: Extension identifier.

    Returns:
        The value produced by the last ``logmsg`` call (presumably the
        accumulated log text -- confirm against ``ExtensionCrawler.util``).
    """
    txt = ""
    txt = logmsg(verbose, txt, "Processing {} ...\n".format(ext_id))

    tarpath = archive_file(archivedir, ext_id)
    greppath = os.path.join(outdir, ext_id + ".grep")
    if os.path.exists(greppath):
        os.remove(greppath)

    # process_date() decodes JavaScript with errors="surrogateescape". Writing
    # with the same handler lets lines that hold undecodable bytes be written
    # back instead of raising UnicodeEncodeError and losing the rest of the
    # snapshot.
    with open(greppath, "w", errors="surrogateescape") as out_f, \
            tempfile.TemporaryDirectory() as tmpdir:
        # NOTE(review): extractall() trusts member paths inside the archive;
        # confirm these archives are always produced by the crawler itself.
        with tarfile.open(tarpath) as t:
            t.extractall(tmpdir)
        iddir = os.path.join(tmpdir, ext_id)

        for date in sorted(os.listdir(iddir)):
            try:
                process_date(
                    os.path.join(iddir, date), ext_id, date, greps, out_f)
            except Exception:
                # Best effort: record the failure and continue with the next
                # snapshot.
                txt = logmsg(
                    verbose, txt,
                    "Exception when handling {} on {}:\n".format(
                        ext_id, date))
                txt = logmsg(verbose, txt, traceback.format_exc())

    return txt
|
|
|
|
|
|
def main(argv):
    """Parse the command line and grep all selected extensions.

    Expected form: ``[OPTION]... BASEDIR GREP1 [GREP2...]``.

    Options:
        -h                 print usage and exit
        -a, --archive DIR  archive directory (default ``archive``); extensions
                           are looked up below its ``data`` sub-directory
        -p, --prefix STR   only process sub-directories of ``data`` whose name
                           starts with STR (default: all)
        -t, --threads N    number of worker processes (default 8)

    Usage is printed and the process exits with status 2 on an unknown option,
    a thread count that is not a positive integer, or fewer than two
    positional arguments. BASEDIR is created if it does not exist.

    Args:
        argv: Argument list without the program name.
    """
    archive = "archive"
    prefix = ""
    parallel = 8
    try:
        opts, args = getopt.getopt(argv, "ha:p:t:",
                                   ["archive=", "prefix=", "threads="])
    except getopt.GetoptError:
        help()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            help()
            sys.exit()
        elif opt in ("-a", "--archive"):
            archive = arg
        elif opt in ("-p", "--prefix"):
            prefix = arg
        elif opt in ("-t", "--threads"):
            try:
                parallel = int(arg)
            except ValueError:
                # A malformed count is a usage error, not a crash.
                help()
                sys.exit(2)

    if len(args) < 2 or parallel < 1:
        help()
        sys.exit(2)

    outdir = args[0]
    greps = args[1:]

    # Workers open their result files inside outdir, so it has to exist.
    os.makedirs(outdir, exist_ok=True)

    archivedir = os.path.join(archive, "data")
    threeletterdirs = glob.glob(os.path.join(archivedir, prefix + "*"))
    if not threeletterdirs:
        return

    worker = partial(process_id, archivedir, outdir, greps, True)
    # One pool serves all prefix directories instead of one pool per directory.
    with Pool(parallel) as p:
        for threeletterdir in threeletterdirs:
            # Entries are reduced to their first 32 characters and
            # de-duplicated; sorting makes the processing order reproducible.
            ext_ids = sorted({d[:32] for d in os.listdir(threeletterdir)})
            for txt in p.imap(worker, ext_ids):
                sys.stdout.write(txt)
                sys.stdout.flush()
|
|
|
|
|
|
# Script entry point: forward everything after the program name to main().
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    main(cli_args)
|