Merge branch 'master' into production

Author: Achim D. Brucker
Date:   2017-06-18 15:38:13 +01:00
Commit: f95619670c
6 changed files with 397 additions and 200 deletions

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python3
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
@@ -26,7 +26,7 @@ from random import randint
import datetime
from ExtensionCrawler.config import *
from ExtensionCrawler.util import *
from ExtensionCrawler.archive import *
from ExtensionCrawler.archive import archive_file
from ExtensionCrawler.sqlite import *
import dateutil
import dateutil.parser
@@ -76,7 +76,7 @@ class RequestResult:
class UpdateResult:
def __init__(self, id, is_new, exception, res_overview, res_crx,
res_reviews, res_support):
res_reviews, res_support, res_sql, sql_update):
self.id = id
self.new = is_new
self.exception = exception
@@ -84,6 +84,8 @@ class UpdateResult:
self.res_crx = res_crx
self.res_reviews = res_reviews
self.res_support = res_support
self.res_sql = res_sql
self.sql_update = sql_update
def is_new(self):
return self.new
@@ -128,9 +130,11 @@ class UpdateResult:
def corrupt_tar(self):
return self.exception is not None
def get_local_archive_dir(id):
return "{}".format(id[:3])
def sql_exception(self):
return self.res_sql is not None
def sql_success(self):
return self.sql_update
def write_text(tardir, date, fname, text):
@@ -262,6 +266,7 @@ def update_crx(archivedir, tmptardir, verbose, ext_id, date):
timeout=10,
allow_redirects=True)
etag = res.headers.get('Etag')
write_text(tmptardir, date, extfilename + ".etag", etag)
logtxt = logmsg(verbose, logtxt, (
" - checking etag, last: {}\n" +
" current: {}\n").format(
@@ -287,6 +292,8 @@ def update_crx(archivedir, tmptardir, verbose, ext_id, date):
for chunk in res.iter_content(chunk_size=512 * 1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
write_text(tmptardir, date, extfilename + ".etag",
res.headers.get("ETag"))
except Exception as e:
logtxt = logmsg(verbose, logtxt,
" - Exception: {}\n".format(str(e)))
@@ -354,6 +361,8 @@ def update_extension(archivedir, verbose, forums, ext_id):
logtxt = logmsg(verbose, "", " Updating {}".format(ext_id))
is_new = False
tar_exception = None
sql_exception = None
sql_success = False
tmptardir = ""
tmptar = ""
@@ -380,7 +389,7 @@ def update_extension(archivedir, verbose, forums, ext_id):
logtxt = logmsg(verbose, logtxt, " / Exception: {}\n".format(str(e)))
tar_exception = e
return UpdateResult(ext_id, is_new, tar_exception, res_overview,
res_crx, res_reviews, res_support)
res_crx, res_reviews, res_support, sql_exception, False)
res_overview, msg_overview = update_overview(tmptardir, date, verbose,
ext_id)
@@ -443,10 +452,22 @@ def update_extension(archivedir, verbose, forums, ext_id):
except Exception:
pass
msg_updatesqlite = update_sqlite(archivedir, tmptardir, verbose, ext_id,
date)
log(verbose, logtxt + msg_updatesqlite)
try:
sql_success, msg_updatesqlite = update_sqlite(archivedir, tmptardir, ext_id, date, is_new,
verbose, 11 * " ")
logtxt = logmsg(verbose, logtxt, msg_updatesqlite)
except Exception as e:
logtxt = logmsg(verbose, logtxt,
" * Exception during update of sqlite db ")
logtxt = logmsg(verbose, logtxt, " / Exception: {}\n".format(str(e)))
sql_exception = e
try:
write_text(tardir, date, ext_id + ".sql.exception", str(e))
except Exception:
pass
try:
shutil.rmtree(path=tmpdir)
except Exception as e:
@@ -459,11 +480,12 @@ def update_extension(archivedir, verbose, forums, ext_id):
except Exception:
pass
log(verbose, logtxt)
return UpdateResult(ext_id, is_new, tar_exception, res_overview, res_crx,
res_reviews, res_support)
res_reviews, res_support, sql_exception, sql_success)
def update_extensions(archivedir, verbose, forums_ext_ids, ext_ids):
def update_extensions(archivedir, verbose, parallel, forums_ext_ids, ext_ids):
ext_with_forums = []
ext_without_forums = []
ext_ids = list(set(ext_ids) - set(forums_ext_ids))
@@ -471,7 +493,7 @@ def update_extensions(archivedir, verbose, forums_ext_ids, ext_ids):
log(verbose, "Updating {} extensions ({} including forums)\n".format(
len(ext_ids), len(forums_ext_ids)))
# First, update extensions with forums sequentially (and with delays) to
# avoid running into Google's DDOS detection.
log(verbose,
" Updating {} extensions including forums (sequentially))\n".format(
len(forums_ext_ids)))
@@ -486,7 +508,7 @@ def update_extensions(archivedir, verbose, forums_ext_ids, ext_ids):
log(verbose,
" Updating {} extensions excluding forums (parallel))\n".format(
len(parallel_ids)))
with Pool(12) as p:
with Pool(parallel) as p:
ext_without_forums = list(
p.map(
partial(update_extension, archivedir, verbose, False),
@@ -506,5 +528,6 @@ def get_existing_ids(archivedir, verbose):
def get_forum_ext_ids(confdir, verbose):
with open(os.path.join(confdir, "forums.conf")) as f:
ids = f.readlines()
r = re.compile('^[a-p]+$')
ids = [x.strip() for x in ids]
return ids
return list(filter(r.match, ids))
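
Chrome extension IDs are 32-character strings over the letters a to p, so the new '^[a-p]+$' filter keeps only plain extension IDs from forums.conf and drops blank lines, comments, and the like. A minimal sketch of the effect (the sample ID is made up):

import re

r = re.compile('^[a-p]+$')
lines = ["# forums to watch", "", "aaaabbbbccccddddeeeeffffgggghhhh"]
ids = [x.strip() for x in lines]
print(list(filter(r.match, ids)))
# -> ['aaaabbbbccccddddeeeeffffgggghhhh']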

View File

@@ -16,6 +16,8 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
def const_sitemap_url():
return "https://chrome.google.com/webstore/sitemap"
@@ -74,3 +76,12 @@ def const_review_payload(ext_id, start, end):
'"startindex":"{}",' + '"numresults":"{}",' + '"id":"428"}}],' +
'"internedKeys":[],' + '"internedValues":[]}}').format(ext_id, start,
end)
def get_local_archive_dir(id):
return "{}".format(id[:3])
def archive_file(archivedir, ext_id):
return os.path.join(
str(archivedir), get_local_archive_dir(ext_id), ext_id + ".tar")
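
The two helpers added here shard the archive by the first three letters of the extension ID, so no single directory has to hold the tar files of every extension. A quick sketch of the resulting layout (the ID is made up):

from ExtensionCrawler.config import archive_file

print(archive_file("archive/data", "aaaabbbbccccddddeeeeffffgggghhhh"))
# -> archive/data/aaa/aaaabbbbccccddddeeeeffffgggghhhh.tar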

View File

@@ -1,7 +1,6 @@
#!/usr/bin/env python3
#
# Copyright (C) 2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
@@ -18,11 +17,205 @@
from ExtensionCrawler.config import *
from ExtensionCrawler.util import *
from ExtensionCrawler.crx import *
from ExtensionCrawler import archive
import sqlite3
import re
from bs4 import BeautifulSoup
from zipfile import ZipFile
import json
import os
import tempfile
import tarfile
import glob
def update_sqlite(archivedir, tmptardir, verbose, ext_id, date):
indent = " "
txt = logmsg(verbose, "", indent + "* Updating SQLite ...")
txt = logmsg(verbose, txt, "")
class SqliteUpdateError(Exception):
def __init__(self, reason="unknown"):
self.reason = reason
def get_etag(ext_id, datepath, con):
# Trying etag file
etagpath = next(iter(glob.glob(os.path.join(datepath, "*.etag"))), None)
if etagpath:
with open(etagpath) as f:
return f.read()
# Trying to parse header file for etag
headerpath = next(
iter(glob.glob(os.path.join(datepath, "*.crx.headers"))), None)
if headerpath:
with open(headerpath) as f:
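# Note: the *.crx.headers file is expected to hold the repr() of a dict,
# which eval() round-trips here; ast.literal_eval would be a safer choice
# if the archive contents are not fully trusted.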
headers = eval(f.read())
if "ETag" in headers:
return headers["ETag"]
# Trying to look up previous etag in database
linkpath = next(
iter(glob.glob(os.path.join(datepath, "*.crx.link"))), None)
if linkpath:
with open(linkpath) as f:
link = f.read()
linked_date = link[3:].split("/")[0]
row = next(
con.execute(
"SELECT crx_etag FROM extension WHERE extid=? AND date=?",
(ext_id, linked_date)), None)
if row:
return row[0]
def get_overview_status(datepath):
with open(os.path.join(datepath, "overview.html.status")) as f:
return int(f.read())
def get_crx_status(datepath):
statuspath = next(
iter(glob.glob(os.path.join(datepath, "*.crx.status"))), None)
if statuspath:
with open(statuspath) as f:
return int(f.read())
def parse_and_insert_overview(ext_id, date, datepath, con):
overview_path = os.path.join(datepath, "overview.html")
with open(overview_path) as overview_file:
contents = overview_file.read()
# Extract extension name
match = re.search("""<meta itemprop="name" content="(.*?)"\s*/>""",
contents)
name = match.group(1) if match else None
# Extract extension version
match = re.search("""<meta itemprop="version" content="(.*?)"\s*/>""",
contents)
version = match.group(1) if match else None
# Extracts extension categories
match = re.search("""Attribute name="category">(.+?)</Attribute>""",
contents)
categories = match.group(1).split(",") if match else None
# Extracts the number of downloads
match = re.search("""user_count.*?(\d+)""", contents)
downloads = int(match.group(1)) if match else None
# Extracts the full extension description as it appears on the overview page
doc = BeautifulSoup(contents, 'html.parser')
description_parent = doc.find('div', itemprop="description")
description = str(description_parent.contents[
0]) if description_parent and description_parent.contents else None
full_description = str(
description_parent.parent) if description_parent else None
developer_parent = doc.find(class_=lambda cls: cls and "e-f-Me" in cls)
developer = str(
developer_parent.contents[0]) if developer_parent else None
last_updated_parent = doc.find(
class_=lambda cls: cls and "h-C-b-p-D-xh-hh" in cls)
last_updated = str(
last_updated_parent.contents[0]) if last_updated_parent else None
etag = get_etag(ext_id, datepath, con)
overview_status = get_overview_status(datepath)
crx_status = get_crx_status(datepath)
con.execute("INSERT INTO extension VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
(ext_id, date, name, version, description, downloads,
full_description, developer, etag, last_updated,
overview_status, crx_status))
if categories:
for category in categories:
con.execute("INSERT INTO category VALUES (?,?,?)",
(ext_id, date, category))
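# Illustrative sketch (not part of this commit) of how the name regex
# above behaves; the HTML fragment is invented for the example:
#
#   import re
#   contents = '<meta itemprop="name" content="Sample Extension"/>'
#   m = re.search("""<meta itemprop="name" content="(.*?)"\s*/>""", contents)
#   print(m.group(1) if m else None)  # -> Sample Extension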
def parse_and_insert_crx(ext_id, date, datepath, con):
etag = get_etag(ext_id, datepath, con)
crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
filename = os.path.basename(crx_path)
with ZipFile(crx_path) as f:
with f.open("manifest.json") as m:
try:
# There are some manifests that seem to have weird encodings...
manifest = json.loads(m.read().decode("utf-8-sig"))
if "permissions" in manifest:
for permission in manifest["permissions"]:
con.execute(
"INSERT OR REPLACE INTO permission VALUES (?,?)",
(etag, str(permission)))
except json.decoder.JSONDecodeError:
pass
public_key = read_crx(crx_path).pk
con.execute("INSERT INTO crx VALUES (?,?,?)", (etag, filename,
public_key))
def update_sqlite_incremental(db_path, datepath, ext_id, date, verbose,
indent):
txt = ""
txt = logmsg(verbose, txt,
indent + "- updating using {}\n".format(datepath))
if not os.path.exists(db_path):
raise SqliteUpdateError("db file not found")
with sqlite3.connect(db_path) as con:
parse_and_insert_overview(ext_id, date, datepath, con)
crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
etag = get_etag(ext_id, datepath, con)
etag_already_in_db = next(
con.execute("SELECT COUNT(etag) FROM crx WHERE etag=?", (etag, )))[
0]
if etag and not etag_already_in_db:
if crx_path:
parse_and_insert_crx(ext_id, date, datepath, con)
else:
raise SqliteUpdateError(
"etag not in db and no crx file present")
return txt
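# Note the guard above: a crx is parsed only when its etag is not yet in
# the db, so an extension that stays unchanged across many crawl dates has
# its manifest and public key stored exactly once; if the etag is unknown
# but no crx file was archived for the date, the incremental update bails
# out with SqliteUpdateError.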
def update_sqlite(archivedir, tmptardir, ext_id, date, is_new, verbose, indent):
update_successful = False
txt = ""
indent2 = indent + 4 * " "
datepath = os.path.join(tmptardir, date)
txt = logmsg(verbose, txt,
indent + "* extracting information into SQLite db...\n")
db_path = os.path.join(archivedir, ext_id[:3], ext_id + ".sqlite")
txt = logmsg(verbose, txt,
indent2 + "- attempting incremental update...\n")
try:
updatetxt = update_sqlite_incremental(db_path, datepath, ext_id, date,
verbose, indent2)
txt = logmsg(verbose, txt, updatetxt)
update_successful = True
except SqliteUpdateError as e:
txt = logmsg(
verbose, txt,
indent2 + "- incremental update failed: {}\n".format(e.reason))
return update_successful, txt
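
A rough usage sketch of the new entry point, mirroring the call site in update_extension (paths, ID, and date are illustrative):

from ExtensionCrawler.sqlite import update_sqlite

ok, log_txt = update_sqlite(
    "archive/data",                      # archivedir, holds aaa/<id>.sqlite
    "/tmp/ext-tmptar",                   # tmptardir with the unpacked crawl
    "aaaabbbbccccddddeeeeffffgggghhhh",  # ext_id (made up)
    "2017-06-18T15:00:00",               # date directory inside the tar
    False,                               # is_new
    True,                                # verbose
    11 * " ")                            # indent for log output
print(ok)        # True only if the incremental update succeeded
print(log_txt)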

crawler (35 changed lines)
View File

@@ -33,6 +33,9 @@ import dateutil.parser
import time
import getopt
# Script should run with Python 3.4 or 3.5
assert sys.version_info >= (3, 4) and sys.version_info < (3, 6)
def write_log(dir, fname, text):
os.makedirs(dir, exist_ok=True)
@@ -78,6 +81,18 @@ def log_failures_to_file(dir, today, res):
sorted(map(lambda x: x.id, filter(lambda x: x.corrupt_tar(), res))),
"")
write_log(dir, today + "-file-corruption.log", file_corruption)
sql_exception = reduce(
lambda x, y: x + "\n" + y,
sorted(map(lambda x: x.id, filter(lambda x: x.sql_exception(), res))),
"")
write_log(dir, today + "-sql-exception.log", sql_exception)
sql_not_updated = reduce(
lambda x, y: x + "\n" + y,
sorted(map(lambda x: x.id, filter(lambda x: not x.sql_success(), res))),
"")
write_log(dir, today + "-sql-not-updated.log", sql_not_updated)
def log_summary(verbose, res, stderr=False, runtime=0):
@@ -95,6 +110,8 @@ def log_summary(verbose, res, stderr=False, runtime=0):
not_in_store = len(list(filter(lambda x: x.not_in_store(), res)))
not_modified = len(list(filter(lambda x: x.not_modified(), res)))
corrupt_tar_archives = list(filter(lambda x: x.corrupt_tar(), res))
sql_exception = len(list(filter(lambda x: x.sql_exception(), res)))
sql_success = len(list(filter(lambda x: x.sql_success(), res)))
new = len(list(filter(lambda x: x.is_new(), res)))
updated = len(
@@ -105,6 +122,7 @@ def log_summary(verbose, res, stderr=False, runtime=0):
p(" Updated {} out of {} extensions successfully\n".format(
str(success), str(total)))
p(" Updated extensions: {:8d}\n".format(updated))
p(" Updated SQL databases: {:8d}\n".format(sql_success))
p(" New extensions: {:8d}\n".format(new))
p(" Not authorized: {:8d}\n".format(not_authorized))
p(" Raised Google DDOS: {:8d}\n".format(raised_ddos))
@@ -112,6 +130,7 @@
p(" Extensions not in store: {:8d}\n".format(not_in_store))
p(" Unknown exception: {:8d}\n".format(has_exception))
p(" Corrupt tar archives: {:8d}\n".format(len(corrupt_tar_archives)))
p(" SQL exception: {:8d}\n".format(sql_exception))
p(" Total runtime: {}\n".format(
str(datetime.timedelta(seconds=int(runtime)))))
@@ -135,10 +154,11 @@ def help():
def main(argv):
today = datetime.datetime.now(datetime.timezone.utc).isoformat()
basedir = "archive"
parallel = 24
verbose = True
discover = False
try:
opts, args = getopt.getopt(argv, "hsda:", ["archive="])
opts, args = getopt.getopt(argv, "hsdap:", ["archive=",'parallel='])
except getopt.GetoptError:
help()
sys.exit(2)
@@ -148,6 +168,8 @@
sys.exit()
elif opt in ("-a", "--archive"):
basedir = arg
elif opt in ("-p", "--parallel"):
parallel = int(arg)
elif opt == '-s':
verbose = False
elif opt == '-d':
@@ -164,10 +186,11 @@ def main(argv):
start_time = time.time()
log(verbose, "Configuration:\n")
log(verbose, " Base dir: {}\n".format(basedir))
log(verbose, " Archive dir: {}\n".format(archive_dir))
log(verbose, " Conf. dir: {}\n".format(conf_dir))
log(verbose, " Discover new ext.: {}\n".format(discover))
log(verbose, " Base dir: {}\n".format(basedir))
log(verbose, " Archive directory: {}\n".format(archive_dir))
log(verbose, " Configuration directory: {}\n".format(conf_dir))
log(verbose, " Discover new extensions: {}\n".format(discover))
log(verbose, " Max num. of concurrent downloads: {}\n".format(parallel))
log(verbose, "\n")
forum_ext_ids = get_forum_ext_ids(conf_dir, verbose)
@@ -178,7 +201,7 @@ def main(argv):
discovered_ids = get_new_ids(verbose, known_ids)
ext_ids = list(set(discovered_ids) | set(known_ids))
res = update_extensions(archive_dir, verbose, forum_ext_ids, ext_ids)
res = update_extensions(archive_dir, verbose, parallel, forum_ext_ids, ext_ids)
# We re-try (once) the extensions with unknown exceptions, as
# they are often temporary
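
With the new option, the size of the worker pool used for the forum-less extensions is tunable from the command line; for example (path illustrative):

./crawler -a /srv/archive -p 12

caps the parallel phase at a 12-process pool instead of the default of 24.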

create_db (new executable file, 120 lines)
View File

@@ -0,0 +1,120 @@
#!/usr/bin/env python3
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import getopt
import os
import sys
import glob
import tarfile
import tempfile
from ExtensionCrawler.sqlite import *
def setup_tables(con):
con.execute("""CREATE TABLE review ("""
"""id INTEGER PRIMARY KEY,"""
"""extid TEXT,"""
"""date TEXT,"""
"""user TEXT,"""
"""reviewdate TEXT,"""
"""rating TEXT,"""
"""comment TEXT"""
""")""")
con.execute("""CREATE TABLE category ("""
"""extid TEXT,"""
"""date TEXT,"""
"""category TEXT,"""
"""PRIMARY KEY (extid, date, category)"""
""")""")
con.execute("""CREATE TABLE permission ("""
"""crx_etag TEXT,"""
"""permission TEXT,"""
"""PRIMARY KEY (crx_etag, permission)"""
""")""")
con.execute("""CREATE TABLE crx ("""
"""etag TEXT PRIMARY KEY,"""
"""filename TEXT,"""
"""publickey BLOB"""
""")""")
con.execute("""CREATE TABLE extension ("""
"""extid TEXT,"""
"""date TEXT,"""
"""name TEXT,"""
"""version TEXT,"""
"""description TEXT,"""
"""downloads INTEGER,"""
"""fulldescription TEXT,"""
"""developer TEXT,"""
"""crx_etag TEXT,"""
"""lastupdated TEXT,"""
"""crx_status INTEGER,"""
"""overview_status INTEGER,"""
"""PRIMARY KEY (extid, date),"""
"""FOREIGN KEY (crx_etag) REFERENCES crx(etag)"""
""")""")
def help():
print("create_db [OPTION]")
print(" -h print this help text")
print(" -a=<DIR> archive directory")
print(" -p=<PREFIX> three-letter-prefix")
def main(argv):
basedir = "archive"
prefix = ""
try:
opts, args = getopt.getopt(argv, "ha:p:", ["archive=", "prefix="])
except getopt.GetoptError:
help()
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
help()
sys.exit()
elif opt in ("-a", "--archive"):
basedir = arg
elif opt in ("-p", "--prefix"):
prefix = arg
archive_dir = os.path.join(basedir, "data")
threeletterdirs = glob.glob(os.path.join(archive_dir, prefix + "*"))
for threeletterdir in threeletterdirs:
for ext_id in set([d[:32] for d in os.listdir(threeletterdir)]):
tarpath = os.path.join(threeletterdir, ext_id + ".tar")
dbpath = os.path.join(threeletterdir, ext_id + ".sqlite")
if os.path.exists(dbpath):
os.remove(dbpath)
with tempfile.TemporaryDirectory() as tmpdir:
with tarfile.open(tarpath) as t:
t.extractall(tmpdir)
iddir = os.path.join(tmpdir, ext_id)
with sqlite3.connect(dbpath) as con:
setup_tables(con)
for date in sorted(os.listdir(iddir)):
datepath = os.path.join(iddir, date)
print(
update_sqlite_incremental(dbpath, datepath, ext_id,
date, True, ""))
if __name__ == "__main__":
main(sys.argv[1:])
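
After a run, every three-letter directory holds one SQLite file per extension; a quick way to inspect one (path and ID are made up):

import sqlite3

with sqlite3.connect(
        "archive/data/aaa/aaaabbbbccccddddeeeeffffgggghhhh.sqlite") as con:
    for date, name, version in con.execute(
            "SELECT date, name, version FROM extension ORDER BY date"):
        print(date, name, version)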

View File

@@ -1,173 +0,0 @@
#!/usr/bin/env python3
#
# Copyright (C) 2016 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from zipfile import ZipFile
import argparse
import json
import sys
import os
from jsmin import jsmin
import re
regex_concrete_url = re.compile(r'^.*://.*[a-z0-9]+\.[a-z]+.*$')
class PermissionHandlerPrintNames:
def __init__(self, permname):
self.permname = permname
self.extinfo = {}
def handle_permission(self, extid, permobj, path):
if self.permname in str(permobj):
with open(os.path.join(path, 'metadata.json')) as f:
metadata = json.load(f)
self.extinfo[extid] = '{} | {} | {}'.format(metadata[1],
metadata[6], path)
def print_result(self, fileobj, delim):
fileobj.write('Extensions that use permission "{}":\n\n'.format(
self.permname))
for extid in self.extinfo:
fileobj.write('{}\n'.format(self.extinfo[extid]))
fileobj.write('\n\n')
class PermissionHandler:
def __init__(self):
self.permissions = {}
self.extids = set()
def handle_permission(self, extid, permobj, path):
self.extids.add(extid)
perm = str(permobj)
if not perm in self.permissions:
self.permissions[perm] = 0
self.permissions[perm] += 1
def print_result(self, fileobj, delim):
fileobj.write('Total: {} extensions\n'.format(len(self.extids)))
for perm in sorted(
self.permissions, key=self.permissions.get, reverse=True):
fileobj.write('{}{}{}{}{:.2%}\n'.format(
perm, delim, self.permissions[perm], delim,
float(self.permissions[perm]) / len(self.extids)))
fileobj.write('\n\n')
class PermissionHandlerCondensed:
def __init__(self):
self.permissions = {}
self.extids = set()
self.exts_with_concrete_urls = set()
def handle_permission(self, extid, permobj, path):
self.extids.add(extid)
perm = str(permobj)
if regex_concrete_url.match(perm):
if extid in self.exts_with_concrete_urls:
return
self.exts_with_concrete_urls.add(extid)
perm = '<<<{}>>>'.format(regex_concrete_url.pattern)
if not perm in self.permissions:
self.permissions[perm] = 0
self.permissions[perm] += 1
def print_result(self, fileobj, delim):
fileobj.write('Condensed. Total: {} extensions\n'.format(
len(self.extids)))
for perm in sorted(
self.permissions, key=self.permissions.get, reverse=True):
fileobj.write('{}{}{}{}{:.2%}\n'.format(
perm, delim, self.permissions[perm], delim,
float(self.permissions[perm]) / len(self.extids)))
fileobj.write('\n\n')
class PermissionStatisticGenerator:
def run(category_folder, permhandlers):
for root, dirs, files in os.walk(category_folder):
crxfile = next((f for f in files if f.endswith('.crx')), None)
if crxfile:
extid = os.path.basename(root)
with ZipFile(os.path.join(root, crxfile)) as zipfile:
with zipfile.open('manifest.json') as f:
content = jsmin(f.read().decode())
# This is needed to strip weird BOMs ...
first_bracket = content.find('{')
if first_bracket >= 0:
content = content[first_bracket:]
manifest = json.loads(content)
if 'permissions' in manifest:
for permobj in manifest['permissions']:
for handler in permhandlers:
handler.handle_permission(extid, permobj,
root)
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Prints statistics about the requested permissions of downloaded extensions.'
)
parser.add_argument(
'dir',
help='The directory in which the extensions are stored. The directory structure must be {category}/{extid}/*.crx.'
)
parser.add_argument(
'-d',
'--delim',
default='\t',
help='Delimiter used for the statistics output.')
parser.add_argument(
'-o',
'--output',
default=sys.stdout,
type=argparse.FileType('w'),
help='Save the statistics into a file.')
parser.add_argument(
'-p',
'--permission',
help='Prints out all extension names and descriptions that use the given permission.'
)
parser.add_argument(
'-c',
'--categories',
action='store_true',
help='Print the results for each category separately.')
args = parser.parse_args()
category_folders = [args.dir]
if args.categories:
category_folders += [
os.path.join(args.dir, d) for d in next(os.walk(args.dir))[1]
]
for category_folder in category_folders:
args.output.write('Results for category {}:\n\n'.format(
category_folder))
if args.permission:
handlers = [PermissionHandlerPrintNames(args.permission)]
else:
handlers = [PermissionHandler(), PermissionHandlerCondensed()]
PermissionStatisticGenerator.run(category_folder, handlers)
for handler in handlers:
handler.print_result(args.output, args.delim)