Merge branch 'master' into production

Achim D. Brucker 2017-06-20 20:03:10 +01:00
commit 8dbd535e3e
5 changed files with 327 additions and 204 deletions

.gitignore

@@ -58,4 +58,7 @@ docs/_build/
# PyBuilder
target/
# vi
*.swp
archive


@@ -76,7 +76,7 @@ class RequestResult:
class UpdateResult:
def __init__(self, id, is_new, exception, res_overview, res_crx,
res_reviews, res_support,res_sql, sql_update):
res_reviews, res_support, res_sql, sql_update):
self.id = id
self.new = is_new
self.exception = exception
@@ -111,18 +111,18 @@ class UpdateResult:
(self.res_support is not None and self.res_support.not_found()))
def has_exception(self):
return (
self.res_overview.has_exception() or
self.res_crx.has_exception() or
(self.res_reviews is not None and self.res_reviews.has_exception())
or (self.res_support is not None and
self.res_support.has_exception()))
return (self.res_overview.has_exception() or
self.res_crx.has_exception() or
(self.res_reviews is not None and
self.res_reviews.has_exception()) or (
self.res_support is not None and
self.res_support.has_exception()))
def raised_google_ddos(self):
return (
(self.res_reviews is not None and self.res_reviews.not_available())
or (self.res_support is not None and
self.res_support.not_available()))
return ((self.res_reviews is not None and
self.res_reviews.not_available()) or
(self.res_support is not None and
self.res_support.not_available()))
def not_modified(self):
return self.res_crx.not_modified()
@@ -132,7 +132,7 @@ class UpdateResult:
def sql_exception(self):
return self.res_sql is not None
def sql_success(self):
return self.sql_update
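
A minimal sketch of how a caller might inspect an UpdateResult after a crawl; only the constructor and methods visible in this diff are assumed, the rest is illustrative:

res = update_extension(archivedir, verbose, forums, ext_id)
if res.has_exception():
    print("{}: download raised an exception".format(res.id))
elif res.sql_exception():
    print("{}: archived, but the SQLite update failed".format(res.id))
elif res.sql_success():
    print("{}: archive and database updated".format(res.id))
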
@@ -261,11 +261,10 @@ def update_crx(archivedir, tmptardir, verbose, ext_id, date):
extfilename = "default.crx"
if res.status_code == 304:
res = requests.head(
etag = requests.head(
const_download_url().format(ext_id),
timeout=10,
allow_redirects=True)
etag = res.headers.get('Etag')
allow_redirects=True).headers.get('ETag')
write_text(tmptardir, date, extfilename + ".etag", etag)
logtxt = logmsg(verbose, logtxt, (
" - checking etag, last: {}\n" +
@@ -389,7 +388,8 @@ def update_extension(archivedir, verbose, forums, ext_id):
logtxt = logmsg(verbose, logtxt, " / Exception: {}\n".format(str(e)))
tar_exception = e
return UpdateResult(ext_id, is_new, tar_exception, res_overview,
res_crx, res_reviews, res_support, sql_exception, False)
res_crx, res_reviews, res_support, sql_exception,
False)
res_overview, msg_overview = update_overview(tmptardir, date, verbose,
ext_id)
@@ -453,10 +453,11 @@ def update_extension(archivedir, verbose, forums, ext_id):
pass
try:
sql_success, msg_updatesqlite = update_sqlite(archivedir, tmptardir, ext_id, date, is_new,
verbose, 11 * " ")
logtxt = logmsg(verbose, logtxt, " * Updating db...\n")
msg_updatesqlite = update_sqlite_incremental(
archivedir, tmptardir, ext_id, date, verbose, 15 * " ")
logtxt = logmsg(verbose, logtxt, msg_updatesqlite)
sql_success = True
except Exception as e:
logtxt = logmsg(verbose, logtxt,
" * Exception during update of sqlite db ")


@@ -1,7 +1,7 @@
#!/usr/bin/env python3
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
@@ -85,3 +85,8 @@ def get_local_archive_dir(id):
def archive_file(archivedir, ext_id):
return os.path.join(
str(archivedir), get_local_archive_dir(ext_id), ext_id + ".tar")
def db_file(archivedir, ext_id):
return os.path.join(archivedir,
get_local_archive_dir(ext_id), ext_id + ".sqlite")
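
The new db_file helper mirrors archive_file, so the per-extension tar and SQLite database sit next to each other. A rough sketch of the resulting layout, assuming (as the three-letter-prefix option in create_db below suggests) that get_local_archive_dir shards by the first three characters of the extension id:

import os

def get_local_archive_dir(id):
    return id[:3]   # assumption for illustration only

archivedir = "archive/data"   # placeholder
ext_id = "a" * 32             # made-up 32-character extension id

tar_path = os.path.join(archivedir, get_local_archive_dir(ext_id), ext_id + ".tar")
db_path = os.path.join(archivedir, get_local_archive_dir(ext_id), ext_id + ".sqlite")
# e.g. archive/data/aaa/aaa...a.tar next to archive/data/aaa/aaa...a.sqlite
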


@@ -18,7 +18,6 @@
from ExtensionCrawler.config import *
from ExtensionCrawler.util import *
from ExtensionCrawler.crx import *
from ExtensionCrawler.archive import *
import sqlite3
@@ -27,33 +26,82 @@ from bs4 import BeautifulSoup
from zipfile import ZipFile
import json
import os
import tempfile
import tarfile
import glob
class SqliteUpdateError(Exception):
def __init__(self, reason="unknown"):
self.reason = reason
def setup_tables(con):
con.execute("""CREATE TABLE review ("""
"""id INTEGER PRIMARY KEY,"""
"""extid TEXT,"""
"""date TEXT,"""
"""author TEXT,"""
"""displayname TEXT,"""
"""reviewdate INTEGER,"""
"""rating INTEGER,"""
"""language TEXT,"""
"""shortauthor TEXT,"""
"""comment TEXT"""
""")""")
con.execute("""CREATE TABLE category ("""
"""extid TEXT,"""
"""date TEXT,"""
"""category TEXT,"""
"""PRIMARY KEY (extid, date, category)"""
""")""")
con.execute("""CREATE TABLE permission ("""
"""crx_etag TEXT,"""
"""permission TEXT,"""
"""PRIMARY KEY (crx_etag, permission)"""
""")""")
con.execute("""CREATE TABLE crx ("""
"""etag TEXT PRIMARY KEY,"""
"""filename TEXT,"""
"""publickey BLOB"""
""")""")
con.execute("""CREATE TABLE status ("""
"""extid TEXT,"""
"""date TEXT,"""
"""crx_status INTEGER,"""
"""overview_status INTEGER,"""
"""overview_exception TEXT,"""
"""PRIMARY KEY (extid, date)"""
""")""")
con.execute("""CREATE TABLE extension ("""
"""extid TEXT,"""
"""date TEXT,"""
"""name TEXT,"""
"""version TEXT,"""
"""description TEXT,"""
"""downloads INTEGER,"""
"""fulldescription TEXT,"""
"""developer TEXT,"""
"""crx_etag TEXT,"""
"""lastupdated TEXT,"""
"""PRIMARY KEY (extid, date),"""
"""FOREIGN KEY (crx_etag) REFERENCES crx(etag)"""
""")""")
def get_etag(ext_id, datepath, con):
#Trying etag file
etagpath = next(iter(glob.glob(os.path.join(datepath, "*.etag"))), None)
if etagpath:
with open(etagpath) as f:
return f.read()
def get_etag(ext_id, datepath, con, verbose, indent):
txt = ""
#Trying to parse header file for etag
# Trying to parse header file for etag
headerpath = next(
iter(glob.glob(os.path.join(datepath, "*.crx.headers"))), None)
if headerpath:
with open(headerpath) as f:
headers = eval(f.read())
if "ETag" in headers:
return headers["ETag"]
content = f.read()
try:
headers = eval(content)
if "ETag" in headers:
return headers["ETag"], txt
except Exception:
txt = logmsg(
verbose, txt,
indent + "* WARNING: could not parse crx header file")
pass
#Trying to look up previous etag in database
# Trying to look up previous etag in database
linkpath = next(
iter(glob.glob(os.path.join(datepath, "*.crx.link"))), None)
if linkpath:
@@ -66,12 +114,16 @@ def get_etag(ext_id, datepath, con):
"SELECT crx_etag FROM extension WHERE extid=? AND date=?",
(ext_id, linked_date)), None)
if row:
return row[0]
return row[0], txt
return None, txt
def get_overview_status(datepath):
with open(os.path.join(datepath, "overview.html.status")) as f:
return int(f.read())
overviewstatuspath = os.path.join(datepath, "overview.html.status")
if os.path.exists(overviewstatuspath):
with open(overviewstatuspath) as f:
return int(f.read())
def get_crx_status(datepath):
@@ -81,141 +133,222 @@ def get_crx_status(datepath):
with open(statuspath) as f:
return int(f.read())
# If the extension is paid, we will find a main.headers file...
statuspath = os.path.join(datepath, "main.status")
if os.path.exists(statuspath):
with open(statuspath) as f:
return int(f.read())
# ... or a default.crx.headers file
statuspath = os.path.join(datepath, "default.crx.status")
if os.path.exists(statuspath):
with open(statuspath) as f:
return int(f.read())
def parse_and_insert_overview(ext_id, date, datepath, con, verbose, indent):
txt = ""
def parse_and_insert_overview(ext_id, date, datepath, con):
overview_path = os.path.join(datepath, "overview.html")
with open(overview_path) as overview_file:
contents = overview_file.read()
if os.path.exists(overview_path):
with open(overview_path) as overview_file:
contents = overview_file.read()
# Extract extension name
match = re.search("""<meta itemprop="name" content="(.*?)"\s*/>""",
contents)
name = match.group(1) if match else None
# Extract extension name
match = re.search("""<meta itemprop="name" content="(.*?)"\s*/>""",
contents)
name = match.group(1) if match else None
# Extract extension version
match = re.search("""<meta itemprop="version" content="(.*?)"\s*/>""",
contents)
version = match.group(1) if match else None
# Extract extension version
match = re.search(
"""<meta itemprop="version" content="(.*?)"\s*/>""", contents)
version = match.group(1) if match else None
# Extracts extension categories
match = re.search("""Attribute name="category">(.+?)</Attribute>""",
contents)
categories = match.group(1).split(",") if match else None
# Extracts extension categories
match = re.search(
"""Attribute name="category">(.+?)</Attribute>""", contents)
categories = match.group(1).split(",") if match else None
# Extracts the number of downloads
match = re.search("""user_count.*?(\d+)""", contents)
downloads = int(match.group(1)) if match else None
# Extracts the number of downloads
match = re.search("""user_count.*?(\d+)""", contents)
downloads = int(match.group(1)) if match else None
# Extracts the full extension description as it appears on the overview page
doc = BeautifulSoup(contents, 'html.parser')
# Extracts the full extension description as it appears on the
# overview page
doc = BeautifulSoup(contents, 'html.parser')
description_parent = doc.find('div', itemprop="description")
description = str(description_parent.contents[
0]) if description_parent and description_parent.contents else None
full_description = str(
description_parent.parent) if description_parent else None
description_parent = doc.find('div', itemprop="description")
description = str(
description_parent.contents[0]
) if description_parent and description_parent.contents else None
full_description = str(
description_parent.parent) if description_parent else None
developer_parent = doc.find(class_=lambda cls: cls and "e-f-Me" in cls)
developer = str(
developer_parent.contents[0]) if developer_parent else None
developer_parent = doc.find(
class_=lambda cls: cls and "e-f-Me" in cls)
developer = str(
developer_parent.contents[0]) if developer_parent else None
last_updated_parent = doc.find(
class_=lambda cls: cls and "h-C-b-p-D-xh-hh" in cls)
last_updated = str(
last_updated_parent.contents[0]) if last_updated_parent else None
last_updated_parent = doc.find(
class_=lambda cls: cls and "h-C-b-p-D-xh-hh" in cls)
last_updated = str(last_updated_parent.contents[
0]) if last_updated_parent else None
etag = get_etag(ext_id, datepath, con)
etag, etag_msg = get_etag(ext_id, datepath, con, verbose, indent)
txt = logmsg(verbose, txt, etag_msg)
overview_status = get_overview_status(datepath)
con.execute("INSERT INTO extension VALUES (?,?,?,?,?,?,?,?,?,?)",
(ext_id, date, name, version, description, downloads,
full_description, developer, etag, last_updated))
crx_status = get_crx_status(datepath)
if categories:
for category in categories:
con.execute("INSERT INTO category VALUES (?,?,?)",
(ext_id, date, category))
con.execute("INSERT INTO extension VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
(ext_id, date, name, version, description, downloads,
full_description, developer, etag, last_updated,
overview_status, crx_status))
if categories:
for category in categories:
con.execute("INSERT INTO category VALUES (?,?,?)",
(ext_id, date, category))
return txt
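
parse_and_insert_overview mixes regular expressions over the raw HTML with BeautifulSoup lookups. A toy run of the same extraction on a made-up overview fragment:

import re
from bs4 import BeautifulSoup

contents = ('<meta itemprop="name" content="Example Extension"/>'
            '<meta itemprop="version" content="1.2.3"/>'
            '<div itemprop="description">Does example things.</div>')

match = re.search(r'<meta itemprop="name" content="(.*?)"\s*/>', contents)
name = match.group(1) if match else None
match = re.search(r'<meta itemprop="version" content="(.*?)"\s*/>', contents)
version = match.group(1) if match else None

doc = BeautifulSoup(contents, 'html.parser')
description_parent = doc.find('div', itemprop="description")
description = (str(description_parent.contents[0])
               if description_parent and description_parent.contents else None)
print(name, version, description)   # Example Extension 1.2.3 Does example things.
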
def parse_and_insert_crx(ext_id, date, datepath, con):
etag = get_etag(ext_id, datepath, con)
def parse_and_insert_crx(ext_id, date, datepath, con, verbose, indent):
txt = ""
crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
filename = os.path.basename(crx_path)
if crx_path:
filename = os.path.basename(crx_path)
with ZipFile(crx_path) as f:
with f.open("manifest.json") as m:
try:
with ZipFile(crx_path) as f:
etag, etag_msg = get_etag(ext_id, datepath, con, verbose, indent)
txt = logmsg(verbose, txt, etag_msg)
with f.open("manifest.json") as m:
raw_content = m.read()
# There are some manifests that seem to have weird encodings...
manifest = json.loads(m.read().decode("utf-8-sig"))
try:
content = raw_content.decode("utf-8-sig")
except UnicodeDecodeError:
# Trying a different encoding, manifests are weird...
content = raw_content.decode("latin1")
# Attempt to remove JavaScript-style comments from json
comment_regex = re.compile(r'\s*//.*')
multiline_comment_regex = re.compile(r'\s*/\\*.*\\*/')
lines = content.splitlines()
for index, line in enumerate(lines):
if comment_regex.match(
line) or multiline_comment_regex.match(line):
lines[index] = ""
content = "\n".join(lines)
manifest = json.loads(content, strict=False)
if "permissions" in manifest:
for permission in manifest["permissions"]:
con.execute(
"INSERT OR REPLACE INTO permission VALUES (?,?)",
(etag, str(permission)))
except json.decoder.JSONDecodeError:
pass
public_key = read_crx(crx_path).pk
con.execute("INSERT INTO crx VALUES (?,?,?)", (etag, filename,
public_key))
def update_sqlite_incremental(db_path, datepath, ext_id, date, verbose,
indent):
txt = ""
txt = logmsg(verbose, txt,
indent + "- updating using {}\n".format(datepath))
if not os.path.exists(db_path):
raise SqliteUpdateError("db file not found")
with sqlite3.connect(db_path) as con:
parse_and_insert_overview(ext_id, date, datepath, con)
crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
etag = get_etag(ext_id, datepath, con)
etag_already_in_db = next(
con.execute("SELECT COUNT(etag) FROM crx WHERE etag=?", (etag, )))[
0]
if etag and not etag_already_in_db:
if crx_path:
parse_and_insert_crx(ext_id, date, datepath, con)
else:
raise SqliteUpdateError(
"etag not in db and no crx file present")
public_key = read_crx(crx_path).pk
con.execute("INSERT INTO crx VALUES (?,?,?)", (etag, filename,
public_key))
return txt
def update_sqlite(archivedir, tmptardir, ext_id, date, is_new, verbose, indent):
update_successful = False
def get(d, k):
if d and k in d:
return d[k]
def parse_and_insert_review(ext_id, date, reviewpath, con):
with open(reviewpath) as f:
content = f.read()
stripped = content[content.find('{"'):]
d = json.JSONDecoder().raw_decode(stripped)
annotations = get(next(iter(d), None), "annotations")
if annotations:
for review in d[0]["annotations"]:
timestamp = get(review, "timestamp")
starRating = get(review, "starRating")
comment = get(review, "comment")
displayname = get(get(review, "entity"), "displayName")
author = get(get(review, "entity"), "author")
language = get(get(review, "entity"), "language")
shortauthor = get(get(review, "entity"), "shortAuthor")
con.execute("INSERT INTO review VALUES(?,?,?,?,?,?,?,?,?,?)",
(None, ext_id, date, author, displayname,
timestamp, starRating, language, shortauthor,
comment))
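
parse_and_insert_review drops everything before the first '{"' because the stored review responses start with a non-JSON prefix; raw_decode then parses the first JSON value and ignores trailing bytes. A small illustration on made-up data:

import json

# Made-up review payload with a junk prefix, shaped like the stored responses.
content = 'junk-prefix {"annotations": [{"comment": "Great extension", "starRating": 5}]}'
stripped = content[content.find('{"'):]
obj, end = json.JSONDecoder().raw_decode(stripped)
for review in obj.get("annotations", []):
    print(review.get("starRating"), review.get("comment"))
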
def parse_and_insert_status(ext_id, date, datepath, con):
overview_status = get_overview_status(datepath)
crx_status = get_crx_status(datepath)
overviewexceptionpath = os.path.join(datepath, "overview.html.exception")
overview_exception = None
if os.path.exists(overviewexceptionpath):
with open(overviewexceptionpath) as f:
overview_exception = f.read()
con.execute("INSERT INTO status VALUES (?,?,?,?,?)",
(ext_id, date, crx_status, overview_status,
overview_exception))
def update_sqlite_incremental(archivedir, tmptardir, ext_id, date, verbose,
indent):
txt = ""
indent2 = indent + 4 * " "
db_path = db_file(archivedir, ext_id)
datepath = os.path.join(tmptardir, date)
txt = logmsg(verbose, txt,
indent + "* extracting information into SQLite db...\n")
indent + "- updating with data from {}\n".format(date))
db_path = os.path.join(archivedir, ext_id[:3], ext_id + ".sqlite")
if not os.path.exists(db_path):
txt = logmsg(verbose, txt,
indent2 + "* db file does not exist, creating...\n")
with sqlite3.connect(db_path) as con:
setup_tables(con)
txt = logmsg(verbose, txt,
indent2 + "- attempting incremental update...\n")
try:
updatetxt = update_sqlite_incremental(db_path, datepath, ext_id, date,
verbose, indent2)
txt = logmsg(verbose, txt, updatetxt)
update_successful = True
except SqliteUpdateError as e:
txt = logmsg(
verbose, txt,
indent2 + "- incremental update failed: {}\n".format(e.reason))
with sqlite3.connect(db_path) as con:
parse_and_insert_status(ext_id, date, datepath, con)
return update_successful, txt
parse_and_insert_overview(ext_id, date, datepath, con, verbose,
indent2)
etag, etag_msg = get_etag(ext_id, datepath, con, verbose, indent2)
txt = logmsg(verbose, txt, etag_msg)
etag_already_in_db = next(
con.execute("SELECT COUNT(etag) FROM crx WHERE etag=?", (etag, )))[
0]
if etag:
if not etag_already_in_db:
try:
crx_msg = parse_and_insert_crx(ext_id, date, datepath, con,
verbose, indent)
txt = logmsg(verbose, txt, crx_msg)
except zipfile.BadZipfile as e:
txt = logmsg(
verbose, txt, indent2 +
"* WARNING: the found crx file is not a zip file, exception: "
)
txt = logmsg(verbose, txt, str(e))
txt = logmsg(verbose, txt, "\n")
else:
crx_status = get_crx_status(datepath)
if crx_status != 401 and crx_status != 204 and crx_status != 404:
txt = logmsg(verbose, txt,
indent2 + "* WARNING: could not find etag\n")
reviewpaths = glob.glob(os.path.join(datepath, "reviews*.text"))
for reviewpath in reviewpaths:
try:
parse_and_insert_review(ext_id, date, reviewpath, con)
except json.decoder.JSONDecodeError as e:
txt = logmsg(
verbose, txt,
indent2 + "* Could not parse review file, exception: ")
txt = logmsg(verbose, txt, str(e))
txt = logmsg(verbose, txt, "\n")
return txt
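
parse_and_insert_crx above tolerates manifests with byte-order marks, non-UTF-8 bytes and JavaScript-style comments before handing them to json.loads. A condensed, standalone sketch of that cleanup; the sample manifest is made up and only the line-comment case is shown:

import json
import re

raw_content = b'\xef\xbb\xbf{\n  // default settings\n  "name": "Example",\n  "permissions": ["tabs"]\n}'
try:
    content = raw_content.decode("utf-8-sig")
except UnicodeDecodeError:
    content = raw_content.decode("latin1")   # some manifests are not valid UTF-8

# Blank out lines that are only JavaScript-style comments, then parse leniently.
comment_regex = re.compile(r'\s*//.*')
lines = ["" if comment_regex.match(line) else line for line in content.splitlines()]
manifest = json.loads("\n".join(lines), strict=False)
print(manifest.get("permissions", []))   # ['tabs']
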

create_db

@@ -22,52 +22,11 @@ import sys
import glob
import tarfile
import tempfile
import traceback
from multiprocessing import Pool
from ExtensionCrawler.sqlite import *
def setup_tables(con):
con.execute("""CREATE TABLE review ("""
"""id INTEGER PRIMARY KEY,"""
"""extid TEXT,"""
"""date TEXT,"""
"""user TEXT,"""
"""reviewdate TEXT,"""
"""rating TEXT,"""
"""comment TEXT"""
""")""")
con.execute("""CREATE TABLE category ("""
"""extid TEXT,"""
"""date TEXT,"""
"""category TEXT,"""
"""PRIMARY KEY (extid, date, category)"""
""")""")
con.execute("""CREATE TABLE permission ("""
"""crx_etag TEXT,"""
"""permission TEXT,"""
"""PRIMARY KEY (crx_etag, permission)"""
""")""")
con.execute("""CREATE TABLE crx ("""
"""etag TEXT PRIMARY KEY,"""
"""filename TEXT,"""
"""publickey BLOB"""
""")""")
con.execute("""CREATE TABLE extension ("""
"""extid TEXT,"""
"""date TEXT,"""
"""name TEXT,"""
"""version TEXT,"""
"""description TEXT,"""
"""downloads INTEGER,"""
"""fulldescription TEXT,"""
"""developer TEXT,"""
"""crx_etag TEXT,"""
"""lastupdated TEXT,"""
"""crx_status INTEGER,"""
"""overview_status INTEGER,"""
"""PRIMARY KEY (extid, date),"""
"""FOREIGN KEY (crx_etag) REFERENCES crx(etag)"""
""")""")
from ExtensionCrawler.config import *
def help():
@@ -75,13 +34,45 @@ def help():
print(" -h print this help text")
print(" -a=<DIR> archive directory")
print(" -p=<PREFIX> three-letter-prefix")
print(" -t=<THREADS> number of parallel threads")
def process_id(archivedir, verbose, ext_id):
txt = ""
txt = logmsg(verbose, txt, "Processing {} ...\n".format(ext_id))
tarpath = archive_file(archivedir, ext_id)
dbpath = db_file(archivedir, ext_id)
if os.path.exists(dbpath):
os.remove(dbpath)
with tempfile.TemporaryDirectory() as tmpdir:
with tarfile.open(tarpath) as t:
t.extractall(tmpdir)
iddir = os.path.join(tmpdir, ext_id)
for date in sorted(os.listdir(iddir)):
try:
update_txt = update_sqlite_incremental(
archivedir, iddir, ext_id, date, True, "")
txt = logmsg(verbose, txt, update_txt)
except Exception as e:
txt = logmsg(verbose, txt,
"Exception when handling {} on {}:\n".format(
ext_id, date))
txt = logmsg(verbose, txt, traceback.format_exc())
txt = logmsg(verbose, txt, "\n")
return txt
def main(argv):
basedir = "archive"
prefix = ""
parallel = 8
try:
opts, args = getopt.getopt(argv, "ha:p:", ["archive=", "prefix="])
opts, args = getopt.getopt(argv, "ha:p:t:",
["archive=", "prefix=", "threads="])
except getopt.GetoptError:
help()
sys.exit(2)
@@ -93,27 +84,17 @@ def main(argv):
basedir = arg
elif opt in ("-p", "--prefix"):
prefix = arg
elif opt in ("-t", "--threads"):
parallel = int(arg)
archive_dir = os.path.join(basedir, "data")
threeletterdirs = glob.glob(os.path.join(archive_dir, prefix + "*"))
archivedir = os.path.join(basedir, "data")
threeletterdirs = glob.glob(os.path.join(archivedir, prefix + "*"))
for threeletterdir in threeletterdirs:
for ext_id in set([d[:32] for d in os.listdir(threeletterdir)]):
tarpath = os.path.join(threeletterdir, ext_id + ".tar")
dbpath = os.path.join(threeletterdir, ext_id + ".sqlite")
if os.path.exists(dbpath):
os.remove(dbpath)
with tempfile.TemporaryDirectory() as tmpdir:
with tarfile.open(tarpath) as t:
t.extractall(tmpdir)
iddir = os.path.join(tmpdir, ext_id)
with sqlite3.connect(dbpath) as con:
setup_tables(con)
for date in sorted(os.listdir(iddir)):
datepath = os.path.join(iddir, date)
print(
update_sqlite_incremental(dbpath, datepath, ext_id,
date, True, ""))
ext_ids = list(set([d[:32] for d in os.listdir(threeletterdir)]))
with Pool(parallel) as p:
for txt in p.imap(partial(process_id, archivedir, True), ext_ids):
sys.stdout.write(txt)
sys.stdout.flush()
if __name__ == "__main__":
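
To close: create_db now fans the per-extension work out over a process pool. Below is a minimal standalone sketch of the imap/partial pattern it uses, with a placeholder worker in place of the real process_id:

import sys
from functools import partial
from multiprocessing import Pool

def work(archivedir, verbose, ext_id):
    # Placeholder standing in for the real per-extension update.
    return "processed {} under {} (verbose={})\n".format(ext_id, archivedir, verbose)

if __name__ == "__main__":
    ext_ids = ["a" * 32, "b" * 32, "c" * 32]   # made-up extension ids
    with Pool(4) as p:
        # imap evaluates lazily and yields results in input order.
        for txt in p.imap(partial(work, "archive/data", True), ext_ids):
            sys.stdout.write(txt)
            sys.stdout.flush()
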