ExtensionCrawler/grepper

#!/usr/bin/env python3
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
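"""Grep the beautified JavaScript of archived Chrome extensions.

For every extension id found under the archive's data/ directory, this
tool unpacks the per-date snapshots from <ext_id>.tar, beautifies every
.js file inside the CRX, matches each line against the given regex
patterns, and writes the hits as pipe-separated records to
BASEDIR/<ext_id>.grep.
"""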
import getopt
import os
import re
import sys
import glob
import tarfile
import tempfile
import traceback
import jsbeautifier
from functools import partial
from multiprocessing import Pool
from zipfile import ZipFile
# archive_file() and logmsg(), used below, are expected to come in via
# these star imports.
from ExtensionCrawler.sqlite import *
from ExtensionCrawler.config import *

def get_name(overview_path):
    if os.path.exists(overview_path):
        with open(overview_path) as overview_file:
            contents = overview_file.read()

            # Extract the extension name from the store's overview page
            match = re.search(
                r"""<meta itemprop="name" content="(.*?)"\s*/>""", contents)
            if match:
                return match.group(1)


def get_downloads(overview_path):
    if os.path.exists(overview_path):
        with open(overview_path) as overview_file:
            contents = overview_file.read()

            # Extract the download count, e.g. "UserDownloads:12,345"
            match = re.search(
                r"""<meta itemprop="interactionCount" content="UserDownloads:((?:\d|,)+)""",
                contents)
            if match:
                return int(match.group(1).replace(",", ""))


def get_jsloc(f):
    """Count the lines of all .js members of the open ZipFile f after
    beautification, so minified and pretty-printed code compare equally."""
    jsloc = 0
    jsfiles = filter(lambda x: x.filename.endswith(".js"), f.infolist())
    for jsfile in jsfiles:
        with f.open(jsfile) as jsf:
            content = jsf.read().decode(errors="surrogateescape")
            beautified = jsbeautifier.beautify(content)
            jsloc += len(beautified.splitlines())
    return jsloc


def help():
    print("grepper [OPTION] BASEDIR GREP1 [GREP2...]")
    print("    GREP1 [GREP2...]  regex patterns")
    print("    BASEDIR           directory for output")
    print("    -h                print this help text")
    print("    -a <DIR>      --archive=<DIR>      archive directory")
    print("    -p <PREFIX>   --prefix=<PREFIX>    three-letter prefix")
    print("    -t <THREADS>  --threads=<THREADS>  number of parallel threads")
def process_id(archivedir, outdir, greps, verbose, ext_id):
    txt = ""
    txt = logmsg(verbose, txt, "Processing {} ...\n".format(ext_id))

    tarpath = archive_file(archivedir, ext_id)
    greppath = os.path.join(outdir, ext_id + ".grep")
    if os.path.exists(greppath):
        os.remove(greppath)
    with open(greppath, "w") as out_f:
        with tempfile.TemporaryDirectory() as tmpdir:
            with tarfile.open(tarpath) as t:
                t.extractall(tmpdir)
            iddir = os.path.join(tmpdir, ext_id)
            for date in sorted(os.listdir(iddir)):
                overview_path = os.path.join(iddir, date, "overview.html")
                # Both may be None if overview.html is missing or changed
                name = get_name(overview_path)
                downloads = get_downloads(overview_path)
                try:
                    crxpath = next(
                        iter(glob.glob(os.path.join(iddir, date, "*.crx"))),
                        None)
                    if crxpath and os.path.getsize(crxpath) > 0:
                        with ZipFile(crxpath) as f:
                            jsloc = get_jsloc(f)
                            for in_zip_f in f.infolist():
                                if not in_zip_f.filename.endswith(".js"):
                                    continue
                                with f.open(in_zip_f) as f2:
                                    content = f2.read().decode(
                                        errors="surrogateescape")
                                    content = jsbeautifier.beautify(content)
                                    for i, line in enumerate(
                                            content.splitlines()):
                                        for gr in greps:
                                            if re.search(gr, line):
                                                print(
                                                    "|".join([
                                                        ext_id, date,
                                                        (name or "").replace(
                                                            "|", "<PIPE>"),
                                                        str(downloads),
                                                        str(jsloc), gr,
                                                        in_zip_f.filename +
                                                        " (line " +
                                                        str(i + 1) + ")",
                                                        line
                                                    ]),
                                                    file=out_f)
                except Exception:
                    txt = logmsg(
                        verbose, txt,
                        "Exception when handling {} on {}:\n".format(
                            ext_id, date))
                    txt = logmsg(verbose, txt, traceback.format_exc())
    return txt
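

# Each match becomes one pipe-separated record in <ext_id>.grep:
#
#   ext_id|date|name|downloads|jsloc|pattern|file (line N)|matching line
#
# Literal "|" characters in the extension name are escaped as <PIPE>.
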
def main(argv):
    archive = "archive"
    prefix = ""
    parallel = 8
    try:
        opts, args = getopt.getopt(argv, "ha:p:t:",
                                   ["archive=", "prefix=", "threads="])
    except getopt.GetoptError:
        help()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            help()
            sys.exit()
        elif opt in ("-a", "--archive"):
            archive = arg
        elif opt in ("-p", "--prefix"):
            prefix = arg
        elif opt in ("-t", "--threads"):
            parallel = int(arg)
    if len(args) < 2:
        help()
        sys.exit(2)
    outdir = args[0]
    greps = args[1:]

    archivedir = os.path.join(archive, "data")
    threeletterdirs = glob.glob(os.path.join(archivedir, prefix + "*"))
    for threeletterdir in threeletterdirs:
        # Entries in a three-letter dir start with the 32-character
        # extension id, so d[:32] recovers the id itself
        ext_ids = list(set([d[:32] for d in os.listdir(threeletterdir)]))
        with Pool(parallel) as p:
            for txt in p.imap(
                    partial(process_id, archivedir, outdir, greps, True),
                    ext_ids):
                sys.stdout.write(txt)
                sys.stdout.flush()


if __name__ == "__main__":
    main(sys.argv[1:])