Merge branch 'master' into production

commit 8dbd535e3e
@@ -58,4 +58,7 @@ docs/_build/
 # PyBuilder
 target/
 
+# vi
+*.swp
+
 archive
@@ -76,7 +76,7 @@ class RequestResult:
 
 class UpdateResult:
     def __init__(self, id, is_new, exception, res_overview, res_crx,
-                 res_reviews, res_support,res_sql, sql_update):
+                 res_reviews, res_support, res_sql, sql_update):
         self.id = id
         self.new = is_new
         self.exception = exception
@@ -111,18 +111,18 @@ class UpdateResult:
                 (self.res_support is not None and self.res_support.not_found()))
 
     def has_exception(self):
-        return (
-            self.res_overview.has_exception() or
-            self.res_crx.has_exception() or
-            (self.res_reviews is not None and self.res_reviews.has_exception())
-            or (self.res_support is not None and
-                self.res_support.has_exception()))
+        return (self.res_overview.has_exception() or
+                self.res_crx.has_exception() or
+                (self.res_reviews is not None and
+                 self.res_reviews.has_exception()) or (
+                     self.res_support is not None and
+                     self.res_support.has_exception()))
 
     def raised_google_ddos(self):
-        return (
-            (self.res_reviews is not None and self.res_reviews.not_available())
-            or (self.res_support is not None and
-                self.res_support.not_available()))
+        return ((self.res_reviews is not None and
+                 self.res_reviews.not_available()) or
+                (self.res_support is not None and
+                 self.res_support.not_available()))
 
     def not_modified(self):
        return self.res_crx.not_modified()
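The rewrapped predicates above keep the same boolean logic. An equivalent formulation would aggregate the optional sub-results with any(); this is a sketch with a trimmed constructor, not the repository's code:

class UpdateResult:
    def __init__(self, res_overview, res_crx, res_reviews=None,
                 res_support=None):
        self.res_overview = res_overview
        self.res_crx = res_crx
        self.res_reviews = res_reviews
        self.res_support = res_support

    def has_exception(self):
        # None stands for a page that was never requested
        results = (self.res_overview, self.res_crx, self.res_reviews,
                   self.res_support)
        return any(r is not None and r.has_exception() for r in results)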
@@ -132,7 +132,7 @@ class UpdateResult:
 
     def sql_exception(self):
         return self.res_sql is not None
 
-
     def sql_success(self):
         return self.sql_update
 
@@ -261,11 +261,10 @@ def update_crx(archivedir, tmptardir, verbose, ext_id, date):
         extfilename = "default.crx"
 
     if res.status_code == 304:
-        res = requests.head(
+        etag = requests.head(
             const_download_url().format(ext_id),
             timeout=10,
-            allow_redirects=True)
-        etag = res.headers.get('Etag')
+            allow_redirects=True).headers.get('ETag')
         write_text(tmptardir, date, extfilename + ".etag", etag)
         logtxt = logmsg(verbose, logtxt, (
             " - checking etag, last: {}\n" +
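The pattern above: when the conditional download answers 304 Not Modified, the current ETag is fetched with a cheap HEAD request instead of re-downloading the crx. A minimal self-contained sketch, where the URL template is a placeholder standing in for const_download_url() from ExtensionCrawler.config:

import requests

DOWNLOAD_URL = "https://example.org/crx/{}"  # placeholder, not the real URL

def fetch_etag(ext_id):
    # requests' header lookup is case-insensitive, so 'ETag' matches 'Etag'
    return requests.head(
        DOWNLOAD_URL.format(ext_id),
        timeout=10,
        allow_redirects=True).headers.get('ETag')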
@@ -389,7 +388,8 @@ def update_extension(archivedir, verbose, forums, ext_id):
         logtxt = logmsg(verbose, logtxt, " / Exception: {}\n".format(str(e)))
         tar_exception = e
         return UpdateResult(ext_id, is_new, tar_exception, res_overview,
-                            res_crx, res_reviews, res_support, sql_exception, False)
+                            res_crx, res_reviews, res_support, sql_exception,
+                            False)
 
     res_overview, msg_overview = update_overview(tmptardir, date, verbose,
                                                  ext_id)
@@ -453,10 +453,11 @@ def update_extension(archivedir, verbose, forums, ext_id):
         pass
 
     try:
-        sql_success, msg_updatesqlite = update_sqlite(archivedir, tmptardir, ext_id, date, is_new,
-                                                      verbose, 11 * " ")
+        logtxt = logmsg(verbose, logtxt, " * Updating db...\n")
+        msg_updatesqlite = update_sqlite_incremental(
+            archivedir, tmptardir, ext_id, date, verbose, 15 * " ")
         logtxt = logmsg(verbose, logtxt, msg_updatesqlite)
+
+        sql_success = True
     except Exception as e:
         logtxt = logmsg(verbose, logtxt,
                         " * Exception during update of sqlite db ")
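logmsg() comes from ExtensionCrawler.util and is not shown in this commit. A minimal stand-in consistent with how it is called throughout the diff (threading an accumulated transcript through and returning it) might look like this; the real helper may differ:

def logmsg(verbose, txt, msg):
    # Append msg to the running transcript only when verbose output is wanted
    if verbose:
        txt += msg
    return txt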
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 #
 # Copyright (C) 2016,2017 The University of Sheffield, UK
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
@@ -85,3 +85,8 @@ def get_local_archive_dir(id):
 def archive_file(archivedir, ext_id):
     return os.path.join(
         str(archivedir), get_local_archive_dir(ext_id), ext_id + ".tar")
+
+
+def db_file(archivedir, ext_id):
+    return os.path.join(archivedir,
+                        get_local_archive_dir(ext_id), ext_id + ".sqlite")
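Hypothetical usage of the helpers above. get_local_archive_dir() is not shown in this diff; elsewhere in the commit the code uses ext_id[:3], so a three-letter prefix is assumed here:

import os

def get_local_archive_dir(id):
    return id[:3]  # assumption, mirrors ext_id[:3] used in this commit

def archive_file(archivedir, ext_id):
    return os.path.join(
        str(archivedir), get_local_archive_dir(ext_id), ext_id + ".tar")

def db_file(archivedir, ext_id):
    return os.path.join(archivedir,
                        get_local_archive_dir(ext_id), ext_id + ".sqlite")

print(archive_file("archive/data", "aaaabbbbccccddddeeeeffffgggghhhh"))
# archive/data/aaa/aaaabbbbccccddddeeeeffffgggghhhh.tar
print(db_file("archive/data", "aaaabbbbccccddddeeeeffffgggghhhh"))
# archive/data/aaa/aaaabbbbccccddddeeeeffffgggghhhh.sqlite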
@@ -18,7 +18,6 @@
 from ExtensionCrawler.config import *
 from ExtensionCrawler.util import *
 from ExtensionCrawler.crx import *
-
 from ExtensionCrawler.archive import *
 
 import sqlite3
@@ -27,33 +26,82 @@ from bs4 import BeautifulSoup
 from zipfile import ZipFile
 import json
 import os
 import tempfile
 import tarfile
 import glob
 
 
-class SqliteUpdateError(Exception):
-    def __init__(self, reason="unknown"):
-        self.reason = reason
+def setup_tables(con):
+    con.execute("""CREATE TABLE review ("""
+                """id INTEGER PRIMARY KEY,"""
+                """extid TEXT,"""
+                """date TEXT,"""
+                """author TEXT,"""
+                """displayname TEXT,"""
+                """reviewdate INTEGER,"""
+                """rating INTEGER,"""
+                """language TEXT,"""
+                """shortauthor TEXT,"""
+                """comment TEXT"""
+                """)""")
+    con.execute("""CREATE TABLE category ("""
+                """extid TEXT,"""
+                """date TEXT,"""
+                """category TEXT,"""
+                """PRIMARY KEY (extid, date, category)"""
+                """)""")
+    con.execute("""CREATE TABLE permission ("""
+                """crx_etag TEXT,"""
+                """permission TEXT,"""
+                """PRIMARY KEY (crx_etag, permission)"""
+                """)""")
+    con.execute("""CREATE TABLE crx ("""
+                """etag TEXT PRIMARY KEY,"""
+                """filename TEXT,"""
+                """publickey BLOB"""
+                """)""")
+    con.execute("""CREATE TABLE status ("""
+                """extid TEXT,"""
+                """date TEXT,"""
+                """crx_status INTEGER,"""
+                """overview_status INTEGER,"""
+                """overview_exception TEXT,"""
+                """PRIMARY KEY (extid, date)"""
+                """)""")
+    con.execute("""CREATE TABLE extension ("""
+                """extid TEXT,"""
+                """date TEXT,"""
+                """name TEXT,"""
+                """version TEXT,"""
+                """description TEXT,"""
+                """downloads INTEGER,"""
+                """fulldescription TEXT,"""
+                """developer TEXT,"""
+                """crx_etag TEXT,"""
+                """lastupdated TEXT,"""
+                """PRIMARY KEY (extid, date),"""
+                """FOREIGN KEY (crx_etag) REFERENCES crx(etag)"""
+                """)""")
 
 
-def get_etag(ext_id, datepath, con):
-    #Trying etag file
-    etagpath = next(iter(glob.glob(os.path.join(datepath, "*.etag"))), None)
-    if etagpath:
-        with open(etagpath) as f:
-            return f.read()
+def get_etag(ext_id, datepath, con, verbose, indent):
+    txt = ""
 
-    #Trying to parse header file for etag
+    # Trying to parse header file for etag
     headerpath = next(
         iter(glob.glob(os.path.join(datepath, "*.crx.headers"))), None)
     if headerpath:
         with open(headerpath) as f:
-            headers = eval(f.read())
-            if "ETag" in headers:
-                return headers["ETag"]
+            content = f.read()
+            try:
+                headers = eval(content)
+                if "ETag" in headers:
+                    return headers["ETag"], txt
+            except Exception:
+                txt = logmsg(
+                    verbose, txt,
+                    indent + "* WARNING: could not parse crx header file")
+                pass
 
-    #Trying to look up previous etag in database
+    # Trying to look up previous etag in database
     linkpath = next(
         iter(glob.glob(os.path.join(datepath, "*.crx.link"))), None)
     if linkpath:
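The new get_etag() parses the saved *.crx.headers file with eval(). A hedged alternative with the same effect on well-formed files is ast.literal_eval, which only accepts Python literals and so cannot execute code; this is a sketch, not the repository's code:

import ast

def etag_from_headers_file(headerpath):
    with open(headerpath) as f:
        try:
            # literal_eval rejects anything that is not a plain literal
            headers = ast.literal_eval(f.read())
        except (ValueError, SyntaxError):
            return None
        return headers.get("ETag")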
@@ -66,12 +114,16 @@ def get_etag(ext_id, datepath, con):
                 "SELECT crx_etag FROM extension WHERE extid=? AND date=?",
                 (ext_id, linked_date)), None)
         if row:
-            return row[0]
+            return row[0], txt
+
+    return None, txt
 
 
 def get_overview_status(datepath):
-    with open(os.path.join(datepath, "overview.html.status")) as f:
-        return int(f.read())
+    overviewstatuspath = os.path.join(datepath, "overview.html.status")
+    if os.path.exists(overviewstatuspath):
+        with open(overviewstatuspath) as f:
+            return int(f.read())
 
 
 def get_crx_status(datepath):
@@ -81,141 +133,222 @@ def get_crx_status(datepath):
         with open(statuspath) as f:
             return int(f.read())
 
+    # If the extension is paid, we will find a main.headers file...
+    statuspath = os.path.join(datepath, "main.status")
+    if os.path.exists(statuspath):
+        with open(statuspath) as f:
+            return int(f.read())
+
+    # ... or a default.crx.headers file
+    statuspath = os.path.join(datepath, "default.crx.status")
+    if os.path.exists(statuspath):
+        with open(statuspath) as f:
+            return int(f.read())
+
 
-def parse_and_insert_overview(ext_id, date, datepath, con):
-    overview_path = os.path.join(datepath, "overview.html")
-    with open(overview_path) as overview_file:
-        contents = overview_file.read()
+def parse_and_insert_overview(ext_id, date, datepath, con, verbose, indent):
+    txt = ""
+
+    overview_path = os.path.join(datepath, "overview.html")
+    if os.path.exists(overview_path):
+        with open(overview_path) as overview_file:
+            contents = overview_file.read()
 
-    # Extract extension name
-    match = re.search("""<meta itemprop="name" content="(.*?)"\s*/>""",
-                      contents)
-    name = match.group(1) if match else None
+        # Extract extension name
+        match = re.search("""<meta itemprop="name" content="(.*?)"\s*/>""",
+                          contents)
+        name = match.group(1) if match else None
 
-    # Extract extension version
-    match = re.search("""<meta itemprop="version" content="(.*?)"\s*/>""",
-                      contents)
-    version = match.group(1) if match else None
+        # Extract extension version
+        match = re.search(
+            """<meta itemprop="version" content="(.*?)"\s*/>""", contents)
+        version = match.group(1) if match else None
 
-    # Extracts extension categories
-    match = re.search("""Attribute name="category">(.+?)</Attribute>""",
-                      contents)
-    categories = match.group(1).split(",") if match else None
+        # Extracts extension categories
+        match = re.search(
+            """Attribute name="category">(.+?)</Attribute>""", contents)
+        categories = match.group(1).split(",") if match else None
 
-    # Extracts the number of downloads
-    match = re.search("""user_count.*?(\d+)""", contents)
-    downloads = int(match.group(1)) if match else None
+        # Extracts the number of downloads
+        match = re.search("""user_count.*?(\d+)""", contents)
+        downloads = int(match.group(1)) if match else None
 
-    # Extracts the full extension description as it appears on the overview page
-    doc = BeautifulSoup(contents, 'html.parser')
+        # Extracts the full extension description as it appears on the
+        # overview page
+        doc = BeautifulSoup(contents, 'html.parser')
 
-    description_parent = doc.find('div', itemprop="description")
-    description = str(description_parent.contents[
-        0]) if description_parent and description_parent.contents else None
-    full_description = str(
-        description_parent.parent) if description_parent else None
+        description_parent = doc.find('div', itemprop="description")
+        description = str(
+            description_parent.contents[0]
+        ) if description_parent and description_parent.contents else None
+        full_description = str(
+            description_parent.parent) if description_parent else None
 
-    developer_parent = doc.find(class_=lambda cls: cls and "e-f-Me" in cls)
-    developer = str(
-        developer_parent.contents[0]) if developer_parent else None
+        developer_parent = doc.find(
+            class_=lambda cls: cls and "e-f-Me" in cls)
+        developer = str(
+            developer_parent.contents[0]) if developer_parent else None
 
-    last_updated_parent = doc.find(
-        class_=lambda cls: cls and "h-C-b-p-D-xh-hh" in cls)
-    last_updated = str(
-        last_updated_parent.contents[0]) if last_updated_parent else None
+        last_updated_parent = doc.find(
+            class_=lambda cls: cls and "h-C-b-p-D-xh-hh" in cls)
+        last_updated = str(last_updated_parent.contents[
+            0]) if last_updated_parent else None
 
-    etag = get_etag(ext_id, datepath, con)
+        etag, etag_msg = get_etag(ext_id, datepath, con, verbose, indent)
+        txt = logmsg(verbose, txt, etag_msg)
 
-    overview_status = get_overview_status(datepath)
-
-    crx_status = get_crx_status(datepath)
-
-    con.execute("INSERT INTO extension VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
-                (ext_id, date, name, version, description, downloads,
-                 full_description, developer, etag, last_updated,
-                 overview_status, crx_status))
-
-    if categories:
-        for category in categories:
-            con.execute("INSERT INTO category VALUES (?,?,?)",
-                        (ext_id, date, category))
+        con.execute("INSERT INTO extension VALUES (?,?,?,?,?,?,?,?,?,?)",
+                    (ext_id, date, name, version, description, downloads,
+                     full_description, developer, etag, last_updated))
+
+        if categories:
+            for category in categories:
+                con.execute("INSERT INTO category VALUES (?,?,?)",
+                            (ext_id, date, category))
+    return txt
 
 
-def parse_and_insert_crx(ext_id, date, datepath, con):
-    etag = get_etag(ext_id, datepath, con)
-    crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
-    filename = os.path.basename(crx_path)
-
-    with ZipFile(crx_path) as f:
-        with f.open("manifest.json") as m:
-            # There are some manifests that seem to have weird encodings...
-            manifest = json.loads(m.read().decode("utf-8-sig"))
-
-    public_key = read_crx(crx_path).pk
-
-    con.execute("INSERT INTO crx VALUES (?,?,?)", (etag, filename,
-                                                   public_key))
+def parse_and_insert_crx(ext_id, date, datepath, con, verbose, indent):
+    txt = ""
+    crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
+    if crx_path:
+        filename = os.path.basename(crx_path)
+
+        try:
+            with ZipFile(crx_path) as f:
+                etag, etag_msg = get_etag(ext_id, datepath, con, verbose,
+                                          indent)
+                txt = logmsg(verbose, txt, etag_msg)
+                with f.open("manifest.json") as m:
+                    raw_content = m.read()
+                    try:
+                        content = raw_content.decode("utf-8-sig")
+                    except UnicodeDecodeError:
+                        # Trying a different encoding, manifests are weird...
+                        content = raw_content.decode("latin1")
+
+                    # Attempt to remove JavaScript-style comments from json
+                    comment_regex = re.compile(r'\s*//.*')
+                    multiline_comment_regex = re.compile(r'\s*/\*.*\*/')
+                    lines = content.splitlines()
+                    for index, line in enumerate(lines):
+                        if comment_regex.match(
+                                line) or multiline_comment_regex.match(line):
+                            lines[index] = ""
+                    content = "\n".join(lines)
+
+                    manifest = json.loads(content, strict=False)
+                    if "permissions" in manifest:
+                        for permission in manifest["permissions"]:
+                            con.execute(
+                                "INSERT OR REPLACE INTO permission VALUES (?,?)",
+                                (etag, str(permission)))
+        except json.decoder.JSONDecodeError:
+            pass
+
+        public_key = read_crx(crx_path).pk
+
+        con.execute("INSERT INTO crx VALUES (?,?,?)", (etag, filename,
+                                                       public_key))
+    return txt
 
 
-def update_sqlite_incremental(db_path, datepath, ext_id, date, verbose,
-                              indent):
-    txt = ""
-
-    txt = logmsg(verbose, txt,
-                 indent + "- updating using {}\n".format(datepath))
-
-    if not os.path.exists(db_path):
-        raise SqliteUpdateError("db file not found")
-
-    with sqlite3.connect(db_path) as con:
-        parse_and_insert_overview(ext_id, date, datepath, con)
-
-        crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
-
-        etag = get_etag(ext_id, datepath, con)
-        etag_already_in_db = next(
-            con.execute("SELECT COUNT(etag) FROM crx WHERE etag=?", (etag, )))[
-                0]
-        if etag and not etag_already_in_db:
-            if crx_path:
-                parse_and_insert_crx(ext_id, date, datepath, con)
-            else:
-                raise SqliteUpdateError(
-                    "etag not in db and no crx file present")
-    return txt
-
-
-def update_sqlite(archivedir, tmptardir, ext_id, date, is_new, verbose, indent):
-    update_successful = False
-    txt = logmsg(verbose, txt,
-                 indent + "* extracting information into SQLite db...\n")
-
-    db_path = os.path.join(archivedir, ext_id[:3], ext_id + ".sqlite")
-
-    txt = logmsg(verbose, txt,
-                 indent2 + "- attempting incremental update...\n")
-    try:
-        updatetxt = update_sqlite_incremental(db_path, datepath, ext_id, date,
-                                              verbose, indent2)
-        txt = logmsg(verbose, txt, updatetxt)
-        update_successful = True
-    except SqliteUpdateError as e:
-        txt = logmsg(
-            verbose, txt,
-            indent2 + "- incremental update failed: {}\n".format(e.reason))
-
-    return update_successful, txt
+def get(d, k):
+    if d and k in d:
+        return d[k]
 
 
+def parse_and_insert_review(ext_id, date, reviewpath, con):
+    with open(reviewpath) as f:
+        content = f.read()
+        stripped = content[content.find('{"'):]
+        d = json.JSONDecoder().raw_decode(stripped)
+        annotations = get(next(iter(d), None), "annotations")
+        if annotations:
+            for review in d[0]["annotations"]:
+                timestamp = get(review, "timestamp")
+                starRating = get(review, "starRating")
+                comment = get(review, "comment")
+                displayname = get(get(review, "entity"), "displayName")
+                author = get(get(review, "entity"), "author")
+                language = get(get(review, "entity"), "language")
+                shortauthor = get(get(review, "entity"), "shortAuthor")
+
+                con.execute("INSERT INTO review VALUES(?,?,?,?,?,?,?,?,?,?)",
+                            (None, ext_id, date, author, displayname,
+                             timestamp, starRating, language, shortauthor,
+                             comment))
+
+
+def parse_and_insert_status(ext_id, date, datepath, con):
+    overview_status = get_overview_status(datepath)
+    crx_status = get_crx_status(datepath)
+
+    overviewexceptionpath = os.path.join(datepath, "overview.html.exception")
+    overview_exception = None
+    if os.path.exists(overviewexceptionpath):
+        with open(overviewexceptionpath) as f:
+            overview_exception = f.read()
+
+    con.execute("INSERT INTO status VALUES (?,?,?,?,?)",
+                (ext_id, date, crx_status, overview_status,
+                 overview_exception))
+
+
+def update_sqlite_incremental(archivedir, tmptardir, ext_id, date, verbose,
+                              indent):
+    txt = ""
+    indent2 = indent + 4 * " "
+
+    db_path = db_file(archivedir, ext_id)
+    datepath = os.path.join(tmptardir, date)
+
+    txt = logmsg(verbose, txt,
+                 indent + "- updating with data from {}\n".format(date))
+
+    if not os.path.exists(db_path):
+        txt = logmsg(verbose, txt,
+                     indent2 + "* db file does not exist, creating...\n")
+        with sqlite3.connect(db_path) as con:
+            setup_tables(con)
+
+    with sqlite3.connect(db_path) as con:
+        parse_and_insert_status(ext_id, date, datepath, con)
+
+        parse_and_insert_overview(ext_id, date, datepath, con, verbose,
+                                  indent2)
+
+        etag, etag_msg = get_etag(ext_id, datepath, con, verbose, indent2)
+        txt = logmsg(verbose, txt, etag_msg)
+        etag_already_in_db = next(
+            con.execute("SELECT COUNT(etag) FROM crx WHERE etag=?", (etag, )))[
+                0]
+
+        if etag:
+            if not etag_already_in_db:
+                try:
+                    crx_msg = parse_and_insert_crx(ext_id, date, datepath, con,
+                                                   verbose, indent)
+                    txt = logmsg(verbose, txt, crx_msg)
+                except zipfile.BadZipfile as e:
+                    txt = logmsg(
+                        verbose, txt, indent2 +
+                        "* WARNING: the found crx file is not a zip file, exception: "
+                    )
+                    txt = logmsg(verbose, txt, str(e))
+                    txt = logmsg(verbose, txt, "\n")
+        else:
+            crx_status = get_crx_status(datepath)
+            if crx_status != 401 and crx_status != 204 and crx_status != 404:
+                txt = logmsg(verbose, txt,
+                             indent2 + "* WARNING: could not find etag\n")
+
+        reviewpaths = glob.glob(os.path.join(datepath, "reviews*.text"))
+        for reviewpath in reviewpaths:
+            try:
+                parse_and_insert_review(ext_id, date, reviewpath, con)
+            except json.decoder.JSONDecodeError as e:
+                txt = logmsg(
+                    verbose, txt,
+                    indent2 + "* Could not parse review file, exception: ")
+                txt = logmsg(verbose, txt, str(e))
+                txt = logmsg(verbose, txt, "\n")
+    return txt
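Self-contained demo of the manifest clean-up added above: decode with a BOM-tolerant codec, blank out JavaScript-style comment lines, then parse leniently. The steps mirror the diff; the sample manifest bytes are made up:

import json
import re

raw_content = b'\xef\xbb\xbf{\n  // locked to store version\n  "permissions": ["tabs"]\n}'

try:
    # utf-8-sig strips a leading byte-order mark if present
    content = raw_content.decode("utf-8-sig")
except UnicodeDecodeError:
    # Trying a different encoding, manifests are weird...
    content = raw_content.decode("latin1")

comment_regex = re.compile(r'\s*//.*')
multiline_comment_regex = re.compile(r'\s*/\*.*\*/')
lines = content.splitlines()
for index, line in enumerate(lines):
    if comment_regex.match(line) or multiline_comment_regex.match(line):
        lines[index] = ""  # blank out the comment line, keep line numbering
content = "\n".join(lines)

manifest = json.loads(content, strict=False)
print(manifest.get("permissions"))  # ['tabs']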

create_db

@@ -22,52 +22,11 @@ import sys
 import glob
 import tarfile
 import tempfile
 import traceback
+from multiprocessing import Pool
 
 from ExtensionCrawler.sqlite import *
-
-
-def setup_tables(con):
-    con.execute("""CREATE TABLE review ("""
-                """id INTEGER PRIMARY KEY,"""
-                """extid TEXT,"""
-                """date TEXT,"""
-                """user TEXT,"""
-                """reviewdate TEXT,"""
-                """rating TEXT,"""
-                """comment TEXT"""
-                """)""")
-    con.execute("""CREATE TABLE category ("""
-                """extid TEXT,"""
-                """date TEXT,"""
-                """category TEXT,"""
-                """PRIMARY KEY (extid, date, category)"""
-                """)""")
-    con.execute("""CREATE TABLE permission ("""
-                """crx_etag TEXT,"""
-                """permission TEXT,"""
-                """PRIMARY KEY (crx_etag, permission)"""
-                """)""")
-    con.execute("""CREATE TABLE crx ("""
-                """etag TEXT PRIMARY KEY,"""
-                """filename TEXT,"""
-                """publickey BLOB"""
-                """)""")
-    con.execute("""CREATE TABLE extension ("""
-                """extid TEXT,"""
-                """date TEXT,"""
-                """name TEXT,"""
-                """version TEXT,"""
-                """description TEXT,"""
-                """downloads INTEGER,"""
-                """fulldescription TEXT,"""
-                """developer TEXT,"""
-                """crx_etag TEXT,"""
-                """lastupdated TEXT,"""
-                """crx_status INTEGER,"""
-                """overview_status INTEGER,"""
-                """PRIMARY KEY (extid, date),"""
-                """FOREIGN KEY (crx_etag) REFERENCES crx(etag)"""
-                """)""")
+from ExtensionCrawler.config import *
 
 
 def help():
@@ -75,13 +34,45 @@ def help():
     print(" -h print this help text")
     print(" -a=<DIR> archive directory")
     print(" -p=<PREFIX> three-letter-prefix")
+    print(" -t=<THREADS> number of parallel threads")
+
+
+def process_id(archivedir, verbose, ext_id):
+    txt = ""
+    txt = logmsg(verbose, txt, "Processing {} ...\n".format(ext_id))
+
+    tarpath = archive_file(archivedir, ext_id)
+    dbpath = db_file(archivedir, ext_id)
+    if os.path.exists(dbpath):
+        os.remove(dbpath)
+    with tempfile.TemporaryDirectory() as tmpdir:
+        with tarfile.open(tarpath) as t:
+            t.extractall(tmpdir)
+            iddir = os.path.join(tmpdir, ext_id)
+
+            for date in sorted(os.listdir(iddir)):
+                try:
+                    update_txt = update_sqlite_incremental(
+                        archivedir, iddir, ext_id, date, True, "")
+                    txt = logmsg(verbose, txt, update_txt)
+                except Exception as e:
+                    txt = logmsg(verbose, txt,
+                                 "Exception when handling {} on {}:\n".format(
+                                     ext_id, date))
+                    txt = logmsg(verbose, txt, traceback.format_exc())
+
+    txt = logmsg(verbose, txt, "\n")
+
+    return txt
 
 
 def main(argv):
     basedir = "archive"
     prefix = ""
+    parallel = 8
     try:
-        opts, args = getopt.getopt(argv, "ha:p:", ["archive=", "prefix="])
+        opts, args = getopt.getopt(argv, "ha:p:t:",
+                                   ["archive=", "prefix=", "threads="])
     except getopt.GetoptError:
         help()
         sys.exit(2)
@@ -93,27 +84,17 @@ def main(argv):
             basedir = arg
         elif opt in ("-p", "--prefix"):
             prefix = arg
+        elif opt in ("-t", "--threads"):
+            parallel = int(arg)
 
-    archive_dir = os.path.join(basedir, "data")
-    threeletterdirs = glob.glob(os.path.join(archive_dir, prefix + "*"))
+    archivedir = os.path.join(basedir, "data")
+    threeletterdirs = glob.glob(os.path.join(archivedir, prefix + "*"))
     for threeletterdir in threeletterdirs:
-        for ext_id in set([d[:32] for d in os.listdir(threeletterdir)]):
-            tarpath = os.path.join(threeletterdir, ext_id + ".tar")
-            dbpath = os.path.join(threeletterdir, ext_id + ".sqlite")
-            if os.path.exists(dbpath):
-                os.remove(dbpath)
-            with tempfile.TemporaryDirectory() as tmpdir:
-                with tarfile.open(tarpath) as t:
-                    t.extractall(tmpdir)
-                    iddir = os.path.join(tmpdir, ext_id)
-
-                    with sqlite3.connect(dbpath) as con:
-                        setup_tables(con)
-                    for date in sorted(os.listdir(iddir)):
-                        datepath = os.path.join(iddir, date)
-                        print(
-                            update_sqlite_incremental(dbpath, datepath, ext_id,
-                                                      date, True, ""))
+        ext_ids = list(set([d[:32] for d in os.listdir(threeletterdir)]))
+        with Pool(parallel) as p:
+            for txt in p.imap(partial(process_id, archivedir, True), ext_ids):
+                sys.stdout.write(txt)
+                sys.stdout.flush()
 
 
 if __name__ == "__main__":
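Sketch of the new parallel driver above: process_id is curried with the archive directory and verbosity via functools.partial, then mapped over the extension ids with Pool.imap so results stream back in order. Note that partial() needs "from functools import partial"; that import is not visible in this hunk and is assumed to exist elsewhere in the script. The stand-in body and sample id are made up:

import sys
from functools import partial
from multiprocessing import Pool

def process_id(archivedir, verbose, ext_id):
    # Stand-in body; the real function unpacks the tar and updates the db
    return "Processing {} ...\n".format(ext_id)

if __name__ == "__main__":
    ext_ids = ["aaaabbbbccccddddeeeeffffgggghhhh"]
    with Pool(8) as p:
        for txt in p.imap(partial(process_id, "archive/data", True), ext_ids):
            sys.stdout.write(txt)
            sys.stdout.flush()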