Merge branch 'master' into production

commit f95619670c
@@ -26,7 +26,7 @@ from random import randint
 import datetime
 from ExtensionCrawler.config import *
 from ExtensionCrawler.util import *
-from ExtensionCrawler.archive import *
+from ExtensionCrawler.archive import archive_file
 from ExtensionCrawler.sqlite import *
 import dateutil
 import dateutil.parser
@@ -76,7 +76,7 @@ class RequestResult:

 class UpdateResult:
     def __init__(self, id, is_new, exception, res_overview, res_crx,
-                 res_reviews, res_support):
+                 res_reviews, res_support, res_sql, sql_update):
         self.id = id
         self.new = is_new
         self.exception = exception
@@ -84,6 +84,8 @@ class UpdateResult:
         self.res_crx = res_crx
         self.res_reviews = res_reviews
         self.res_support = res_support
+        self.res_sql = res_sql
+        self.sql_update = sql_update

     def is_new(self):
         return self.new
@@ -128,9 +130,11 @@ class UpdateResult:
     def corrupt_tar(self):
         return self.exception is not None

+    def sql_exception(self):
+        return self.res_sql is not None
+
-def get_local_archive_dir(id):
-    return "{}".format(id[:3])
+    def sql_success(self):
+        return self.sql_update


 def write_text(tardir, date, fname, text):
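Note: with the two accessors added above, callers can tell tar failures apart
from database failures. A minimal usage sketch (not part of the commit; the
extension id is made up):

    res = update_extension(archivedir, True, False, "a" * 32)
    if res.sql_exception():
        print("sqlite update raised an exception")
    elif not res.sql_success():
        print("sqlite db was not updated")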
@@ -262,6 +266,7 @@ def update_crx(archivedir, tmptardir, verbose, ext_id, date):
             timeout=10,
             allow_redirects=True)
         etag = res.headers.get('Etag')
+        write_text(tmptardir, date, extfilename + ".etag", etag)
         logtxt = logmsg(verbose, logtxt, (
             " - checking etag, last: {}\n" +
             "                current: {}\n").format(
@@ -287,6 +292,8 @@ def update_crx(archivedir, tmptardir, verbose, ext_id, date):
                 for chunk in res.iter_content(chunk_size=512 * 1024):
                     if chunk:  # filter out keep-alive new chunks
                         f.write(chunk)
+            write_text(tmptardir, date, extfilename + ".etag",
+                       res.headers.get("ETag"))
     except Exception as e:
         logtxt = logmsg(verbose, logtxt,
                         " - Exception: {}\n".format(str(e)))
@@ -354,6 +361,8 @@ def update_extension(archivedir, verbose, forums, ext_id):
     logtxt = logmsg(verbose, "", "    Updating {}".format(ext_id))
     is_new = False
     tar_exception = None
+    sql_exception = None
+    sql_success = False
     tmptardir = ""
     tmptar = ""

@@ -380,7 +389,7 @@ def update_extension(archivedir, verbose, forums, ext_id):
         logtxt = logmsg(verbose, logtxt, " / Exception: {}\n".format(str(e)))
         tar_exception = e
         return UpdateResult(ext_id, is_new, tar_exception, res_overview,
-                            res_crx, res_reviews, res_support)
+                            res_crx, res_reviews, res_support, sql_exception, False)

     res_overview, msg_overview = update_overview(tmptardir, date, verbose,
                                                  ext_id)
@@ -443,10 +452,22 @@ def update_extension(archivedir, verbose, forums, ext_id):
         except Exception:
             pass

-    msg_updatesqlite = update_sqlite(archivedir, tmptardir, verbose, ext_id,
-                                     date)
-    log(verbose, logtxt + msg_updatesqlite)
+    try:
+        sql_success, msg_updatesqlite = update_sqlite(archivedir, tmptardir,
+                                                      ext_id, date, is_new,
+                                                      verbose, 11 * " ")
+        logtxt = logmsg(verbose, logtxt, msg_updatesqlite)
+
+    except Exception as e:
+        logtxt = logmsg(verbose, logtxt,
+                        " * Exception during update of sqlite db ")
+        logtxt = logmsg(verbose, logtxt, " / Exception: {}\n".format(str(e)))
+
+        sql_exception = e
+
+        try:
+            write_text(tardir, date, ext_id + ".sql.exception", str(e))
+        except Exception as e:
+            pass
     try:
         shutil.rmtree(path=tmpdir)
     except Exception as e:
@@ -459,11 +480,12 @@ def update_extension(archivedir, verbose, forums, ext_id):
     except Exception:
         pass

     log(verbose, logtxt)
     return UpdateResult(ext_id, is_new, tar_exception, res_overview, res_crx,
-                        res_reviews, res_support)
+                        res_reviews, res_support, sql_exception, sql_success)


-def update_extensions(archivedir, verbose, forums_ext_ids, ext_ids):
+def update_extensions(archivedir, verbose, parallel, forums_ext_ids, ext_ids):
     ext_with_forums = []
     ext_without_forums = []
     ext_ids = list(set(ext_ids) - set(forums_ext_ids))
@@ -486,7 +508,7 @@ def update_extensions(archivedir, verbose, forums_ext_ids, ext_ids):
     log(verbose,
         "  Updating {} extensions excluding forums (parallel))\n".format(
             len(parallel_ids)))
-    with Pool(12) as p:
+    with Pool(parallel) as p:
         ext_without_forums = list(
             p.map(
                 partial(update_extension, archivedir, verbose, False),
@@ -506,5 +528,6 @@ def get_existing_ids(archivedir, verbose):
 def get_forum_ext_ids(confdir, verbose):
     with open(os.path.join(confdir, "forums.conf")) as f:
         ids = f.readlines()
+    r = re.compile('^[a-p]+$')
     ids = [x.strip() for x in ids]
-    return ids
+    return list(filter(r.match, ids))
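Note: the new '^[a-p]+$' filter relies on Chrome extension ids being 32
characters drawn only from the letters a-p, so comment lines and blank lines
in forums.conf are silently dropped:

    import re
    r = re.compile('^[a-p]+$')
    ids = ["# a comment", "", "a" * 32]
    print(list(filter(r.match, ids)))  # only the 32-letter id survives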
@@ -16,6 +16,8 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #

+import os
+

 def const_sitemap_url():
     return "https://chrome.google.com/webstore/sitemap"
@@ -74,3 +76,12 @@ def const_review_payload(ext_id, start, end):
         '"startindex":"{}",' + '"numresults":"{}",' + '"id":"428"}}],' +
         '"internedKeys":[],' + '"internedValues":[]}}').format(ext_id, start,
                                                                end)
+
+
+def get_local_archive_dir(id):
+    return "{}".format(id[:3])
+
+
+def archive_file(archivedir, ext_id):
+    return os.path.join(
+        str(archivedir), get_local_archive_dir(ext_id), ext_id + ".tar")
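Note: archive_file() shards the archive by the first three letters of the
extension id, e.g. (hypothetical id and directory):

    >>> archive_file("/srv/archive", "a" * 32)
    '/srv/archive/aaa/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa.tar'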
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 #
 # Copyright (C) 2017 The University of Sheffield, UK
 #
@@ -18,11 +17,205 @@
 from ExtensionCrawler.config import *
 from ExtensionCrawler.util import *
 from ExtensionCrawler.crx import *

+from ExtensionCrawler import archive
+
+import sqlite3
+import re
+from bs4 import BeautifulSoup
+from zipfile import ZipFile
+import json
+import os
+import tempfile
+import tarfile
+import glob


-def update_sqlite(archivedir, tmptardir, verbose, ext_id, date):
-    indent = " "
-    txt = logmsg(verbose, "", indent + "* Updating SQLite ...")
-    txt = logmsg(verbose, txt, "")
+class SqliteUpdateError(Exception):
+    def __init__(self, reason="unknown"):
+        self.reason = reason
+
+
+def get_etag(ext_id, datepath, con):
+    # Trying etag file
+    etagpath = next(iter(glob.glob(os.path.join(datepath, "*.etag"))), None)
+    if etagpath:
+        with open(etagpath) as f:
+            return f.read()
+
+    # Trying to parse header file for etag
+    headerpath = next(
+        iter(glob.glob(os.path.join(datepath, "*.crx.headers"))), None)
+    if headerpath:
+        with open(headerpath) as f:
+            headers = eval(f.read())
+            if "ETag" in headers:
+                return headers["ETag"]
+
+    # Trying to look up previous etag in database
+    linkpath = next(
+        iter(glob.glob(os.path.join(datepath, "*.crx.link"))), None)
+    if linkpath:
+        with open(linkpath) as f:
+            link = f.read()
+            linked_date = link[3:].split("/")[0]
+
+            row = next(
+                con.execute(
+                    "SELECT crx_etag FROM extension WHERE extid=? AND date=?",
+                    (ext_id, linked_date)), None)
+            if row:
+                return row[0]

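Note: get_etag() falls back in order: the *.etag file written during the
crawl, then the recorded *.crx.headers file, and finally the *.crx.link file
that points at an earlier dated directory whose etag is already in the db.
The headers file presumably stores the Python repr of the header dict, hence
the eval(); if that assumption holds, ast.literal_eval would accept the same
input without executing arbitrary code:

    import ast
    with open(headerpath) as f:  # headerpath as in get_etag() above
        headers = ast.literal_eval(f.read())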
+def get_overview_status(datepath):
+    with open(os.path.join(datepath, "overview.html.status")) as f:
+        return int(f.read())
+
+
+def get_crx_status(datepath):
+    statuspath = next(
+        iter(glob.glob(os.path.join(datepath, "*.crx.status"))), None)
+    if statuspath:
+        with open(statuspath) as f:
+            return int(f.read())
+
+
+def parse_and_insert_overview(ext_id, date, datepath, con):
+    overview_path = os.path.join(datepath, "overview.html")
+    with open(overview_path) as overview_file:
+        contents = overview_file.read()
+
+        # Extract extension name
+        match = re.search("""<meta itemprop="name" content="(.*?)"\s*/>""",
+                          contents)
+        name = match.group(1) if match else None
+
+        # Extract extension version
+        match = re.search("""<meta itemprop="version" content="(.*?)"\s*/>""",
+                          contents)
+        version = match.group(1) if match else None
+
+        # Extract extension categories
+        match = re.search("""Attribute name="category">(.+?)</Attribute>""",
+                          contents)
+        categories = match.group(1).split(",") if match else None
+
+        # Extract the number of downloads
+        match = re.search("""user_count.*?(\d+)""", contents)
+        downloads = int(match.group(1)) if match else None
+
+        # Extract the full extension description as it appears on the
+        # overview page
+        doc = BeautifulSoup(contents, 'html.parser')
+
+        description_parent = doc.find('div', itemprop="description")
+        description = str(description_parent.contents[0]) \
+            if description_parent and description_parent.contents else None
+        full_description = str(
+            description_parent.parent) if description_parent else None
+
+        developer_parent = doc.find(class_=lambda cls: cls and "e-f-Me" in cls)
+        developer = str(
+            developer_parent.contents[0]) if developer_parent else None
+
+        last_updated_parent = doc.find(
+            class_=lambda cls: cls and "h-C-b-p-D-xh-hh" in cls)
+        last_updated = str(
+            last_updated_parent.contents[0]) if last_updated_parent else None
+
+        etag = get_etag(ext_id, datepath, con)
+
+        overview_status = get_overview_status(datepath)
+
+        crx_status = get_crx_status(datepath)
+
+        con.execute("INSERT INTO extension VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
+                    (ext_id, date, name, version, description, downloads,
+                     full_description, developer, etag, last_updated,
+                     overview_status, crx_status))
+
+        if categories:
+            for category in categories:
+                con.execute("INSERT INTO category VALUES (?,?,?)",
+                            (ext_id, date, category))
+
+
+def parse_and_insert_crx(ext_id, date, datepath, con):
+    etag = get_etag(ext_id, datepath, con)
+    crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
+    filename = os.path.basename(crx_path)
+
+    with ZipFile(crx_path) as f:
+        with f.open("manifest.json") as m:
+            try:
+                # There are some manifests that seem to have weird encodings...
+                manifest = json.loads(m.read().decode("utf-8-sig"))
+                if "permissions" in manifest:
+                    for permission in manifest["permissions"]:
+                        con.execute(
+                            "INSERT OR REPLACE INTO permission VALUES (?,?)",
+                            (etag, str(permission)))
+            except json.decoder.JSONDecodeError:
+                pass
+
+    public_key = read_crx(crx_path).pk
+
+    con.execute("INSERT INTO crx VALUES (?,?,?)", (etag, filename,
+                                                   public_key))
+
+
+def update_sqlite_incremental(db_path, datepath, ext_id, date, verbose,
+                              indent):
+    txt = ""
+
+    txt = logmsg(verbose, txt,
+                 indent + "- updating using {}\n".format(datepath))
+
+    if not os.path.exists(db_path):
+        raise SqliteUpdateError("db file not found")
+
+    with sqlite3.connect(db_path) as con:
+        parse_and_insert_overview(ext_id, date, datepath, con)
+
+        crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
+
+        etag = get_etag(ext_id, datepath, con)
+        etag_already_in_db = next(
+            con.execute("SELECT COUNT(etag) FROM crx WHERE etag=?",
+                        (etag, )))[0]
+        if etag and not etag_already_in_db:
+            if crx_path:
+                parse_and_insert_crx(ext_id, date, datepath, con)
+            else:
+                raise SqliteUpdateError(
+                    "etag not in db and no crx file present")
+
+    return txt
+
+
+def update_sqlite(archivedir, tmptardir, ext_id, date, is_new, verbose,
+                  indent):
+    update_successful = False
+    txt = ""
+    indent2 = indent + 4 * " "
+
+    datepath = os.path.join(tmptardir, date)
+
+    txt = logmsg(verbose, txt,
+                 indent + "* extracting information into SQLite db...\n")
+
+    db_path = os.path.join(archivedir, ext_id[:3], ext_id + ".sqlite")
+
+    txt = logmsg(verbose, txt,
+                 indent2 + "- attempting incremental update...\n")
+    try:
+        updatetxt = update_sqlite_incremental(db_path, datepath, ext_id, date,
+                                              verbose, indent2)
+        txt = logmsg(verbose, txt, updatetxt)
+        update_successful = True
+    except SqliteUpdateError as e:
+        txt = logmsg(
+            verbose, txt,
+            indent2 + "- incremental update failed: {}\n".format(e.reason))

+    return update_successful, txt
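Note: a sketch of querying one of the per-extension databases this module
writes (path and id are made up):

    import sqlite3
    con = sqlite3.connect("archive/data/aaa/" + "a" * 32 + ".sqlite")
    for date, name, version, downloads in con.execute(
            "SELECT date, name, version, downloads FROM extension "
            "ORDER BY date"):
        print(date, name, version, downloads)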
--- crawler
+++ crawler
@@ -33,6 +33,9 @@ import dateutil.parser
 import time
 import getopt

+# Script should run with python 3.4 or 3.5
+assert sys.version_info >= (3, 4) and sys.version_info < (3, 6)
+

 def write_log(dir, fname, text):
     os.makedirs(dir, exist_ok=True)
@@ -79,6 +82,18 @@ def log_failures_to_file(dir, today, res):
         "")
     write_log(dir, today + "-file-corruption.log", file_corruption)

+    sql_exception = reduce(
+        lambda x, y: x + "\n" + y,
+        sorted(map(lambda x: x.id, filter(lambda x: x.sql_exception(), res))),
+        "")
+    write_log(dir, today + "-sql-exception.log", sql_exception)
+
+    sql_success = reduce(
+        lambda x, y: x + "\n" + y,
+        sorted(
+            map(lambda x: x.id, filter(lambda x: not x.sql_success(), res))),
+        "")
+    write_log(dir, today + "-sql-not-updated.log", sql_success)
+

 def log_summary(verbose, res, stderr=False, runtime=0):
     def p(s):
@@ -95,6 +110,8 @@ def log_summary(verbose, res, stderr=False, runtime=0):
     not_in_store = len(list(filter(lambda x: x.not_in_store(), res)))
     not_modified = len(list(filter(lambda x: x.not_modified(), res)))
     corrupt_tar_archives = list(filter(lambda x: x.corrupt_tar(), res))
+    sql_exception = len(list(filter(lambda x: x.sql_exception(), res)))
+    sql_success = len(list(filter(lambda x: x.sql_success(), res)))

     new = len(list(filter(lambda x: x.is_new(), res)))
     updated = len(
@@ -105,6 +122,7 @@ def log_summary(verbose, res, stderr=False, runtime=0):
     p("    Updated {} out of {} extensions successfully\n".format(
         str(success), str(total)))
     p("    Updated extensions:      {:8d}\n".format(updated))
+    p("    Updated SQL databases:   {:8d}\n".format(sql_success))
     p("    New extensions:          {:8d}\n".format(new))
     p("    Not authorized:          {:8d}\n".format(not_authorized))
     p("    Raised Google DDOS:      {:8d}\n".format(raised_ddos))
@@ -112,6 +130,7 @@ def log_summary(verbose, res, stderr=False, runtime=0):
     p("    Extensions not in store: {:8d}\n".format(not_in_store))
     p("    Unknown exception:       {:8d}\n".format(has_exception))
     p("    Corrupt tar archives:    {:8d}\n".format(len(corrupt_tar_archives)))
+    p("    SQL exception:           {:8d}\n".format(sql_exception))
     p("    Total runtime:           {}\n".format(
         str(datetime.timedelta(seconds=int(runtime)))))

@@ -135,10 +154,11 @@ def help():
 def main(argv):
     today = datetime.datetime.now(datetime.timezone.utc).isoformat()
     basedir = "archive"
+    parallel = 24
     verbose = True
     discover = False
     try:
-        opts, args = getopt.getopt(argv, "hsda:", ["archive="])
+        opts, args = getopt.getopt(argv, "hsda:p:", ["archive=", "parallel="])
     except getopt.GetoptError:
         help()
         sys.exit(2)
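Note: example invocation with the new option (paths hypothetical):

    ./crawler -p 48 --archive /srv/extension-archive

This runs the update with up to 48 parallel downloads instead of the
default 24.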
@@ -148,6 +168,8 @@ def main(argv):
             sys.exit()
         elif opt in ("-a", "--archive"):
             basedir = arg
+        elif opt in ("-p", "--parallel"):
+            parallel = int(arg)
         elif opt == '-s':
             verbose = False
         elif opt == '-d':
@@ -165,9 +187,10 @@ def main(argv):

     log(verbose, "Configuration:\n")
     log(verbose, "  Base dir:                         {}\n".format(basedir))
-    log(verbose, "  Archive dir:       {}\n".format(archive_dir))
-    log(verbose, "  Conf. dir:         {}\n".format(conf_dir))
-    log(verbose, "  Discover new ext.: {}\n".format(discover))
+    log(verbose, "  Archive directory:                {}\n".format(archive_dir))
+    log(verbose, "  Configuration directory:          {}\n".format(conf_dir))
+    log(verbose, "  Discover new extensions:          {}\n".format(discover))
+    log(verbose, "  Max num. of concurrent downloads: {}\n".format(parallel))
     log(verbose, "\n")

     forum_ext_ids = get_forum_ext_ids(conf_dir, verbose)
@@ -178,7 +201,7 @@ def main(argv):
         discovered_ids = get_new_ids(verbose, known_ids)
         ext_ids = list(set(discovered_ids) | set(known_ids))

-    res = update_extensions(archive_dir, verbose, forum_ext_ids, ext_ids)
+    res = update_extensions(archive_dir, verbose, parallel, forum_ext_ids, ext_ids)

     # We re-try (once) the extensions with unknown exceptions, as
     # they are often temporary
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+#
+# Copyright (C) 2016,2017 The University of Sheffield, UK
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+import getopt
+import os
+import sys
+import glob
+import tarfile
+import tempfile
+
+from ExtensionCrawler.sqlite import *
+
+
+def setup_tables(con):
+    con.execute("""CREATE TABLE review ("""
+                """id INTEGER PRIMARY KEY,"""
+                """extid TEXT,"""
+                """date TEXT,"""
+                """user TEXT,"""
+                """reviewdate TEXT,"""
+                """rating TEXT,"""
+                """comment TEXT"""
+                """)""")
+    con.execute("""CREATE TABLE category ("""
+                """extid TEXT,"""
+                """date TEXT,"""
+                """category TEXT,"""
+                """PRIMARY KEY (extid, date, category)"""
+                """)""")
+    con.execute("""CREATE TABLE permission ("""
+                """crx_etag TEXT,"""
+                """permission TEXT,"""
+                """PRIMARY KEY (crx_etag, permission)"""
+                """)""")
+    con.execute("""CREATE TABLE crx ("""
+                """etag TEXT PRIMARY KEY,"""
+                """filename TEXT,"""
+                """publickey BLOB"""
+                """)""")
+    con.execute("""CREATE TABLE extension ("""
+                """extid TEXT,"""
+                """date TEXT,"""
+                """name TEXT,"""
+                """version TEXT,"""
+                """description TEXT,"""
+                """downloads INTEGER,"""
+                """fulldescription TEXT,"""
+                """developer TEXT,"""
+                """crx_etag TEXT,"""
+                """lastupdated TEXT,"""
+                """crx_status INTEGER,"""
+                """overview_status INTEGER,"""
+                """PRIMARY KEY (extid, date),"""
+                """FOREIGN KEY (crx_etag) REFERENCES crx(etag)"""
+                """)""")
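Note on the schema above: crx and permission rows are keyed by the crx etag
rather than by (extid, date), so a binary that is unchanged across dates, or
shared by several extensions, is stored once and joined back through
extension.crx_etag, e.g.:

    SELECT e.date, p.permission
    FROM extension e JOIN permission p ON p.crx_etag = e.crx_etag
    WHERE e.extid = ?;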
+
+
+def help():
+    print("create_db [OPTION]")
+    print("    -h          print this help text")
+    print("    -a=<DIR>    archive directory")
+    print("    -p=<PREFIX> three-letter-prefix")
+
+
+def main(argv):
+    basedir = "archive"
+    prefix = ""
+    try:
+        opts, args = getopt.getopt(argv, "ha:p:", ["archive=", "prefix="])
+    except getopt.GetoptError:
+        help()
+        sys.exit(2)
+    for opt, arg in opts:
+        if opt == '-h':
+            help()
+            sys.exit()
+        elif opt in ("-a", "--archive"):
+            basedir = arg
+        elif opt in ("-p", "--prefix"):
+            prefix = arg
+
+    archive_dir = os.path.join(basedir, "data")
+    threeletterdirs = glob.glob(os.path.join(archive_dir, prefix + "*"))
+    for threeletterdir in threeletterdirs:
+        for ext_id in set([d[:32] for d in os.listdir(threeletterdir)]):
+            tarpath = os.path.join(threeletterdir, ext_id + ".tar")
+            dbpath = os.path.join(threeletterdir, ext_id + ".sqlite")
+            if os.path.exists(dbpath):
+                os.remove(dbpath)
+            with tempfile.TemporaryDirectory() as tmpdir:
+                with tarfile.open(tarpath) as t:
+                    t.extractall(tmpdir)
+                iddir = os.path.join(tmpdir, ext_id)
+
+                with sqlite3.connect(dbpath) as con:
+                    setup_tables(con)
+                for date in sorted(os.listdir(iddir)):
+                    datepath = os.path.join(iddir, date)
+                    print(
+                        update_sqlite_incremental(dbpath, datepath, ext_id,
+                                                  date, True, ""))
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
--- permstats.py
+++ /dev/null
@@ -1,173 +0,0 @@
-#!/usr/bin/env python3
-#
-# Copyright (C) 2016 The University of Sheffield, UK
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-#
-
-from zipfile import ZipFile
-import argparse
-import json
-import sys
-import os
-from jsmin import jsmin
-import re
-
-regex_concrete_url = re.compile(r'^.*://.*[a-z0-9]+\.[a-z]+.*$')
-
-
-class PermissionHandlerPrintNames:
-    def __init__(self, permname):
-        self.permname = permname
-        self.extinfo = {}
-
-    def handle_permission(self, extid, permobj, path):
-        if self.permname in str(permobj):
-            with open(os.path.join(path, 'metadata.json')) as f:
-                metadata = json.load(f)
-                self.extinfo[extid] = '{} | {} | {}'.format(metadata[1],
-                                                            metadata[6], path)
-
-    def print_result(self, fileobj, delim):
-        fileobj.write('Extensions that use permission "{}":\n\n'.format(
-            self.permname))
-        for extid in self.extinfo:
-            fileobj.write('{}\n'.format(self.extinfo[extid]))
-        fileobj.write('\n\n')
-
-
-class PermissionHandler:
-    def __init__(self):
-        self.permissions = {}
-        self.extids = set()
-
-    def handle_permission(self, extid, permobj, path):
-        self.extids.add(extid)
-        perm = str(permobj)
-        if not perm in self.permissions:
-            self.permissions[perm] = 0
-        self.permissions[perm] += 1
-
-    def print_result(self, fileobj, delim):
-        fileobj.write('Total: {} extensions\n'.format(len(self.extids)))
-        for perm in sorted(
-                self.permissions, key=self.permissions.get, reverse=True):
-            fileobj.write('{}{}{}{}{:.2%}\n'.format(
-                perm, delim, self.permissions[perm], delim,
-                float(self.permissions[perm]) / len(self.extids)))
-        fileobj.write('\n\n')
-
-
-class PermissionHandlerCondensed:
-    def __init__(self):
-        self.permissions = {}
-        self.extids = set()
-        self.exts_with_concrete_urls = set()
-
-    def handle_permission(self, extid, permobj, path):
-        self.extids.add(extid)
-
-        perm = str(permobj)
-        if regex_concrete_url.match(perm):
-            if extid in self.exts_with_concrete_urls:
-                return
-            self.exts_with_concrete_urls.add(extid)
-            perm = '<<<{}>>>'.format(regex_concrete_url.pattern)
-        if not perm in self.permissions:
-            self.permissions[perm] = 0
-        self.permissions[perm] += 1
-
-    def print_result(self, fileobj, delim):
-        fileobj.write('Condensed. Total: {} extensions\n'.format(
-            len(self.extids)))
-        for perm in sorted(
-                self.permissions, key=self.permissions.get, reverse=True):
-            fileobj.write('{}{}{}{}{:.2%}\n'.format(
-                perm, delim, self.permissions[perm], delim,
-                float(self.permissions[perm]) / len(self.extids)))
-        fileobj.write('\n\n')
-
-
-class PermissionStatisticGenerator:
-    def run(category_folder, permhandlers):
-        for root, dirs, files in os.walk(category_folder):
-            crxfile = next((f for f in files if f.endswith('.crx')), None)
-            if crxfile:
-                extid = os.path.basename(root)
-                with ZipFile(os.path.join(root, crxfile)) as zipfile:
-                    with zipfile.open('manifest.json') as f:
-                        content = jsmin(f.read().decode())
-
-                        # This is needed to strip weird BOMs ...
-                        first_bracket = content.find('{')
-                        if first_bracket >= 0:
-                            content = content[first_bracket:]
-
-                        manifest = json.loads(content)
-                        if 'permissions' in manifest:
-                            for permobj in manifest['permissions']:
-                                for handler in permhandlers:
-                                    handler.handle_permission(extid, permobj,
-                                                              root)
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description='Prints statistics about the requested permissions of downloaded extensions.'
-    )
-    parser.add_argument(
-        'dir',
-        help='The directory in which the extensions are stored. The directory structure must be {category}/{extid}/*.crx.'
-    )
-    parser.add_argument(
-        '-d',
-        '--delim',
-        default='\t',
-        help='Delimiter used for the statistics output.')
-    parser.add_argument(
-        '-o',
-        '--output',
-        default=sys.stdout,
-        type=argparse.FileType('w'),
-        help='Save the statistics into a file.')
-    parser.add_argument(
-        '-p',
-        '--permission',
-        help='Prints out all extension names and descriptions that use the given permission.'
-    )
-    parser.add_argument(
-        '-c',
-        '--categories',
-        action='store_true',
-        help='Print the results for each category separately.')
-
-    args = parser.parse_args()
-
-    category_folders = [args.dir]
-    if args.categories:
-        category_folders += [
-            os.path.join(args.dir, d) for d in next(os.walk(args.dir))[1]
-        ]
-
-    for category_folder in category_folders:
-        args.output.write('Results for category {}:\n\n'.format(
-            category_folder))
-        if args.permission:
-            handlers = [PermissionHandlerPrintNames(args.permission)]
-        else:
-            handlers = [PermissionHandler(), PermissionHandlerCondensed()]
-        PermissionStatisticGenerator.run(category_folder, handlers)
-
-        for handler in handlers:
-            handler.print_result(args.output, args.delim)