#
# Copyright (C) 2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
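"""Incrementally fill an SQLite database from the locally archived data
of Chrome Web Store extensions: overview page metadata, crx packages,
reviews, and review replies."""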

from ExtensionCrawler.config import *
from ExtensionCrawler.util import *
from ExtensionCrawler.crx import *
from ExtensionCrawler.archive import *

import sqlite3
import re
from bs4 import BeautifulSoup
import jsbeautifier
from zipfile import ZipFile, BadZipfile
import json
import os
import glob


class SelfclosingSqliteDB:
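    """Context manager that yields an sqlite3 connection and guarantees
    that it is committed and closed on exit.

    A minimal usage sketch (hypothetical path):

        with SelfclosingSqliteDB("/tmp/extensions.sqlite") as con:
            con.execute("SELECT 1")
    """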
    def __init__(self, filename):
        self.filename = filename

    def __enter__(self):
        self.con = sqlite3.connect(self.filename)
        return self.con

    def __exit__(self, *args):
        self.con.commit()
        self.con.close()


def setup_tables(con):
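    # Note: FTS4 stores every column as text and (to the best of our
    # knowledge) ignores the type and constraint tokens below; they are
    # kept as documentation of the intended schema.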
    con.execute("""CREATE VIRTUAL TABLE review using fts4("""
                """author TEXT PRIMARY KEY,"""
                """extid TEXT,"""
                """date TEXT,"""
                """displayname TEXT,"""
                """reviewdate INTEGER,"""
                """rating INTEGER,"""
                """language TEXT,"""
                """shortauthor TEXT,"""
                """comment TEXT"""
                """)""")

    con.execute("""CREATE VIRTUAL TABLE reviewreplies using fts4("""
                """author TEXT PRIMARY KEY,"""
                """extid TEXT,"""
                """date TEXT,"""
                """displayname TEXT,"""
                """reviewdate INTEGER,"""
                """replyto TEXT,"""
                """language TEXT,"""
                """shortauthor TEXT,"""
                """comment TEXT"""
                """)""")

    con.execute("""CREATE TABLE category ("""
                """extid TEXT,"""
                """date TEXT,"""
                """category TEXT,"""
                """PRIMARY KEY (extid, date, category)"""
                """)""")

    con.execute("""CREATE TABLE content_script_url ("""
                """crx_etag TEXT,"""
                """url TEXT,"""
                """PRIMARY KEY (crx_etag, url)"""
                """)""")

    con.execute("""CREATE TABLE permission ("""
                """crx_etag TEXT,"""
                """permission TEXT,"""
                """PRIMARY KEY (crx_etag, permission)"""
                """)""")

    con.execute("""CREATE TABLE crx ("""
                """crx_etag TEXT PRIMARY KEY,"""
                """filename TEXT,"""
                """size INTEGER,"""
                """jsloc INTEGER,"""
                """publickey BLOB"""
                """)""")

    con.execute("""CREATE TABLE status ("""
                """extid TEXT,"""
                """date TEXT,"""
                """crx_status INTEGER,"""
                """overview_status INTEGER,"""
                """overview_exception TEXT,"""
                """PRIMARY KEY (extid, date)"""
                """)""")

    con.execute("""CREATE TABLE extension ("""
                """extid TEXT,"""
                """date TEXT,"""
                """name TEXT,"""
                """version TEXT,"""
                """description TEXT,"""
                """downloads INTEGER,"""
                """rating REAL,"""
                """ratingcount INTEGER,"""
                """fulldescription TEXT,"""
                """developer TEXT,"""
                """itemcategory TEXT,"""
                """crx_etag TEXT,"""
                """lastupdated TEXT,"""
                """PRIMARY KEY (extid, date),"""
                """FOREIGN KEY (crx_etag) REFERENCES crx(crx_etag)"""
                """)""")


def get_etag(ext_id, datepath, con, verbose, indent):
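    """Determine the etag of the crx downloaded for ext_id at datepath.

    Tries, in order: a *.crx.etag file, the ETag entry of a *.crx.headers
    file, and the extension row (in the database) of the date that a
    *.crx.link file points to. Returns an (etag, log) pair, where etag is
    None if nothing was found."""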
    txt = ""

    # Trying to parse etag file
    etagpath = next(
        iter(glob.glob(os.path.join(datepath, "*.crx.etag"))), None)
    if etagpath:
        with open(etagpath) as f:
            return f.read(), txt

    # Trying to parse header file for etag
    headerpath = next(
        iter(glob.glob(os.path.join(datepath, "*.crx.headers"))), None)
    if headerpath:
        with open(headerpath) as f:
            content = f.read()
            try:
                # The headers file stores a Python dict literal; eval is
                # kept from the original tooling and assumes trusted input.
                headers = eval(content)
                if "ETag" in headers:
                    return headers["ETag"], txt
            except Exception:
                txt = logmsg(
                    verbose, txt,
                    indent + "* WARNING: could not parse crx header file")

    # Trying to look up previous etag in database
    linkpath = next(
        iter(glob.glob(os.path.join(datepath, "*.crx.link"))), None)
    if linkpath:
        with open(linkpath) as f:
            link = f.read()
            linked_date = link[3:].split("/")[0]

            row = next(
                con.execute(
                    "SELECT crx_etag FROM extension WHERE extid=? AND date=?",
                    (ext_id, linked_date)), None)
            if row:
                return row[0], txt

    return None, txt


def get_overview_status(datepath):
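    """Return the HTTP status recorded for the overview page, or None if
    no overview.html.status file exists."""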
    overviewstatuspath = os.path.join(datepath, "overview.html.status")
    if os.path.exists(overviewstatuspath):
        with open(overviewstatuspath) as f:
            return int(f.read())


def get_crx_status(datepath):
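    """Return the HTTP status recorded for the crx download, falling back
    to main.status (paid extensions) and then default.crx.status."""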
    statuspath = next(
        iter(glob.glob(os.path.join(datepath, "*.crx.status"))), None)
    if statuspath:
        with open(statuspath) as f:
            return int(f.read())

    # If the extension is paid, we will find a main.status file ...
    statuspath = os.path.join(datepath, "main.status")
    if os.path.exists(statuspath):
        with open(statuspath) as f:
            return int(f.read())

    # ... or a default.crx.status file
    statuspath = os.path.join(datepath, "default.crx.status")
    if os.path.exists(statuspath):
        with open(statuspath) as f:
            return int(f.read())


def parse_and_insert_overview(ext_id, date, datepath, con, verbose, indent):
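    """Parse the overview.html archived for ext_id at datepath and insert
    the scraped metadata (name, version, rating, downloads, descriptions,
    developer, categories, ...) into the extension and category tables."""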
    txt = ""

    overview_path = os.path.join(datepath, "overview.html")
    if os.path.exists(overview_path):
        with open(overview_path) as overview_file:
            contents = overview_file.read()

        # Extract extension name
        match = re.search(r"""<meta itemprop="name" content="(.*?)"\s*/>""",
                          contents)
        name = match.group(1) if match else None

        # Extract extension version
        match = re.search(
            r"""<meta itemprop="version" content="(.*?)"\s*/>""", contents)
        version = match.group(1) if match else None

        # Extract average rating and number of ratings
        match = re.search(
            r"""<meta itemprop="ratingValue" content="(.*?)"\s*/>""",
            contents)
        rating = float(match.group(1)) if match else None

        match = re.search(
            r"""<meta itemprop="ratingCount" content="(.*?)"\s*/>""",
            contents)
        rating_count = int(match.group(1)) if match else None

        # Extract extension categories
        match = re.search(
            r"""Attribute name="category">(.+?)</Attribute>""", contents)
        categories = match.group(1).split(",") if match else None

        # Extract the number of downloads
        match = re.search(
            r"""<meta itemprop="interactionCount" content="UserDownloads:((?:\d|,)+)""",
            contents)
        downloads = int(match.group(1).replace(",", "")) if match else None

        # Extract the full extension description as it appears on the
        # overview page
        doc = BeautifulSoup(contents, 'html.parser')

        description_parent = doc.find('div', itemprop="description")
        description = str(
            description_parent.contents[0]
        ) if description_parent and description_parent.contents else None
        full_description = str(
            description_parent.parent) if description_parent else None

        developer_parent = doc.find(
            class_=lambda cls: cls and "e-f-Me" in cls)
        developer = "".join([str(x) for x in developer_parent.contents
                             ]) if developer_parent else None

        last_updated_parent = doc.find(
            class_=lambda cls: cls and "h-C-b-p-D-xh-hh" in cls)
        last_updated = str(last_updated_parent.contents[
            0]) if last_updated_parent else None

        etag, etag_msg = get_etag(ext_id, datepath, con, verbose, indent)
        txt = logmsg(verbose, txt, etag_msg)

        match = re.search(
            r"""<Attribute name="item_category">(.*?)</Attribute>""",
            contents)
        itemcategory = match.group(1) if match else None

        con.execute(
            "INSERT INTO extension VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)",
            (ext_id, date, name, version, description, downloads, rating,
             rating_count, full_description, developer, itemcategory, etag,
             last_updated))

        if categories:
            for category in categories:
                con.execute("INSERT INTO category VALUES (?,?,?)",
                            (ext_id, date, category))

    return txt


def parse_and_insert_crx(ext_id, date, datepath, con, verbose, indent):
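    """Parse the *.crx archived at datepath and record its manifest
    permissions, content script URL patterns, file size, beautified
    JavaScript line count, and public key, keyed by the crx etag."""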
    txt = ""
    crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
    if crx_path:
        filename = os.path.basename(crx_path)

        with ZipFile(crx_path) as f:
            etag, etag_msg = get_etag(ext_id, datepath, con, verbose, indent)
            txt = logmsg(verbose, txt, etag_msg)
            with f.open("manifest.json") as m:
                raw_content = m.read()
                # There are some manifests that seem to have weird encodings...
                try:
                    content = raw_content.decode("utf-8-sig")
                except UnicodeDecodeError:
                    # Trying a different encoding, manifests are weird...
                    content = raw_content.decode("latin1")

                # Attempt to remove JavaScript-style comments from the json
                comment_regex = re.compile(r'\s*//.*')
                multiline_comment_regex = re.compile(r'\s*/\*.*\*/')
                lines = content.splitlines()
                for index, line in enumerate(lines):
                    if comment_regex.match(
                            line) or multiline_comment_regex.match(line):
                        lines[index] = ""
                content = "\n".join(lines)

                manifest = json.loads(content, strict=False)
                if "permissions" in manifest:
                    for permission in manifest["permissions"]:
                        con.execute(
                            "INSERT OR REPLACE INTO permission VALUES (?,?)",
                            (etag, str(permission)))
                if "content_scripts" in manifest:
                    for csd in manifest["content_scripts"]:
                        if "matches" in csd:
                            for urlpattern in csd["matches"]:
                                con.execute(
                                    "INSERT OR REPLACE INTO content_script_url VALUES (?,?)",
                                    (etag, str(urlpattern)))

            size = os.path.getsize(crx_path)
            jsloc = 0
            jsfiles = filter(lambda x: x.filename.endswith(".js"),
                             f.infolist())
            for jsfile in jsfiles:
                with f.open(jsfile) as jsf:
                    content = jsf.read().decode(errors="surrogateescape")
                    beautified = jsbeautifier.beautify(content)
                    lines = beautified.splitlines()
                    jsloc += len(lines)

            public_key = read_crx(crx_path).pk

            con.execute("INSERT INTO crx VALUES (?,?,?,?,?)",
                        (etag, filename, size, jsloc, public_key))
    return txt


def get(d, k):
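    """None-safe dictionary lookup: d[k] if d is truthy and contains k,
    None otherwise. Used to drill into optional nested JSON fields."""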
    if d and k in d:
        return d[k]


def parse_and_insert_review(ext_id, date, reviewpath, con):
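    """Parse one archived review file and insert every review annotation
    it contains into the review table."""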
    with open(reviewpath) as f:
        content = f.read()
        # Skip any leading junk (presumably an anti-XSSI prefix) up to the
        # first JSON object
        stripped = content[content.find('{"'):]
        d = json.JSONDecoder().raw_decode(stripped)
    annotations = get(next(iter(d), None), "annotations")
    if annotations:
        for review in annotations:
            timestamp = get(review, "timestamp")
            starRating = get(review, "starRating")
            comment = get(review, "comment")
            displayname = get(get(review, "entity"), "displayName")
            author = get(get(review, "entity"), "author")
            language = get(review, "language")
            shortauthor = get(get(review, "entity"), "shortAuthor")

            con.execute("INSERT INTO review VALUES(?,?,?,?,?,?,?,?,?)",
                        (author, ext_id, date, displayname, timestamp,
                         starRating, language, shortauthor, comment))


def parse_and_insert_review_replies(ext_id, date, reviewsrepliespaths, con):
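    """Parse the archived review replies file and insert every reply
    annotation it contains into the reviewreplies table."""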
    with open(reviewsrepliespaths) as f:
        d = json.load(f)
        for result in d["searchResults"]:
            for annotation in result["annotations"]:
                timestamp = get(annotation, "timestamp")
                replyto = get(
                    get(get(annotation, "entity"), "annotation"), "author")
                comment = get(annotation, "comment")
                displayname = get(get(annotation, "entity"), "displayName")
                author = get(get(annotation, "entity"), "author")
                language = get(annotation, "language")
                shortauthor = get(get(annotation, "entity"), "shortAuthor")
                con.execute(
                    "INSERT INTO reviewreplies VALUES(?,?,?,?,?,?,?,?,?)",
                    (author, ext_id, date, displayname, timestamp, replyto,
                     language, shortauthor, comment))


def parse_and_insert_status(ext_id, date, datepath, con):
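    """Insert the HTTP statuses of the overview page and crx download,
    together with any recorded overview exception, into the status table."""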
    overview_status = get_overview_status(datepath)
    crx_status = get_crx_status(datepath)

    overviewexceptionpath = os.path.join(datepath, "overview.html.exception")
    overview_exception = None
    if os.path.exists(overviewexceptionpath):
        with open(overviewexceptionpath) as f:
            overview_exception = f.read()

    con.execute("INSERT INTO status VALUES (?,?,?,?,?)",
                (ext_id, date, crx_status, overview_status,
                 overview_exception))


def update_sqlite_incremental(db_path, tmptardir, ext_id, date, verbose,
                              indent):
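    """Parse all data archived for ext_id at the given date below
    tmptardir and insert it into the SQLite database at db_path, creating
    the database and its tables first if necessary. Returns the
    accumulated log text."""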
    txt = ""
    indent2 = indent + 4 * " "

    datepath = os.path.join(tmptardir, date)

    txt = logmsg(verbose, txt,
                 indent + "- updating with data from {}\n".format(date))

    if not os.path.exists(db_path):
        txt = logmsg(verbose, txt,
                     indent2 + "* db file does not exist, creating...\n")
        with SelfclosingSqliteDB(db_path) as con:
            setup_tables(con)

    with SelfclosingSqliteDB(db_path) as con:
        parse_and_insert_status(ext_id, date, datepath, con)

        overview_msg = parse_and_insert_overview(ext_id, date, datepath, con,
                                                 verbose, indent2)
        txt = logmsg(verbose, txt, overview_msg)

        etag, etag_msg = get_etag(ext_id, datepath, con, verbose, indent2)
        txt = logmsg(verbose, txt, etag_msg)
        etag_already_in_db = next(
            con.execute("SELECT COUNT(crx_etag) FROM crx WHERE crx_etag=?", (
                etag, )))[0]

        if etag:
            if not etag_already_in_db:
                try:
                    crx_msg = parse_and_insert_crx(ext_id, date, datepath, con,
                                                   verbose, indent)
                    txt = logmsg(verbose, txt, crx_msg)
                except BadZipfile as e:
                    txt = logmsg(
                        verbose, txt, indent2 +
                        "* WARNING: the found crx file is not a zip file, exception: "
                    )
                    txt = logmsg(verbose, txt, str(e))
                    txt = logmsg(verbose, txt, "\n")
        else:
            crx_status = get_crx_status(datepath)
            if crx_status not in (401, 204, 404):
                txt = logmsg(verbose, txt,
                             indent2 + "* WARNING: could not find etag\n")

        reviewpaths = glob.glob(os.path.join(datepath, "reviews*-*.text"))
        for reviewpath in reviewpaths:
            try:
                parse_and_insert_review(ext_id, date, reviewpath, con)
            except json.decoder.JSONDecodeError as e:
                txt = logmsg(
                    verbose, txt,
                    indent2 + "* Could not parse review file, exception: ")
                txt = logmsg(verbose, txt, str(e))
                txt = logmsg(verbose, txt, "\n")

        reviewsrepliespaths = os.path.join(datepath, "reviewsreplies.text")
        if os.path.exists(reviewsrepliespaths):
            try:
                parse_and_insert_review_replies(ext_id, date,
                                                reviewsrepliespaths, con)
            except json.decoder.JSONDecodeError as e:
                txt = logmsg(
                    verbose, txt, indent2 +
                    "* Could not parse review reply file, exception: ")
                txt = logmsg(verbose, txt, str(e))
                txt = logmsg(verbose, txt, "\n")

    return txt