# ExtensionCrawler/ExtensionCrawler/sqlite.py

#
# Copyright (C) 2017 The University of Sheffield, UK
2017-06-16 13:56:23 +00:00
#
2017-06-16 10:06:04 +00:00
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from ExtensionCrawler.config import *
from ExtensionCrawler.util import *
from ExtensionCrawler.crx import *
from ExtensionCrawler.archive import *

import ast
import sqlite3
import re
from bs4 import BeautifulSoup
import jsbeautifier
from zipfile import ZipFile, BadZipFile
import json
import os
import glob


class SelfclosingSqliteDB:
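    """Context manager for a SQLite connection that commits and closes on
    exit.

    Typical use (the path is illustrative):

        with SelfclosingSqliteDB("extensions.sqlite") as con:
            con.execute(...)
    """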
def __init__(self, filename):
        self.filename = filename

    def __enter__(self):
self.con = sqlite3.connect(self.filename)
        return self.con

    def __exit__(self, *args):
self.con.commit()
        self.con.close()


def setup_tables(con):
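    """Create the database schema: FTS4 virtual tables for reviews and
    review replies, plus tables for categories, content-script URLs,
    permissions, crx metadata, crawl status, and extension metadata."""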
con.execute("""CREATE VIRTUAL TABLE review using fts4("""
"""author TEXT PRIMARY KEY,"""
"""extid TEXT,"""
"""date TEXT,"""
"""displayname TEXT,"""
"""reviewdate INTEGER,"""
"""rating INTEGER,"""
"""language TEXT,"""
"""shortauthor TEXT,"""
"""comment TEXT"""
""")""")
con.execute("""CREATE VIRTUAL TABLE reviewreplies using fts4("""
"""author TEXT PRIMARY KEY,"""
"""extid TEXT,"""
"""date TEXT,"""
"""displayname TEXT,"""
"""reviewdate INTEGER,"""
"""replyto TEXT,"""
"""language TEXT,"""
"""shortauthor TEXT,"""
"""comment TEXT"""
""")""")
con.execute("""CREATE TABLE category ("""
"""extid TEXT,"""
"""date TEXT,"""
"""category TEXT,"""
"""PRIMARY KEY (extid, date, category)"""
""")""")
con.execute("""CREATE TABLE content_script_url ("""
"""crx_etag TEXT,"""
"""url TEXT,"""
"""PRIMARY KEY (crx_etag, url)"""
""")""")
con.execute("""CREATE TABLE permission ("""
"""crx_etag TEXT,"""
"""permission TEXT,"""
"""PRIMARY KEY (crx_etag, permission)"""
""")""")
con.execute("""CREATE TABLE crx ("""
"""crx_etag TEXT PRIMARY KEY,"""
"""filename TEXT,"""
"""size INTEGER,"""
"""jsloc INTEGER,"""
"""publickey BLOB"""
""")""")
con.execute("""CREATE TABLE status ("""
"""extid TEXT,"""
"""date TEXT,"""
"""crx_status INTEGER,"""
"""overview_status INTEGER,"""
"""overview_exception TEXT,"""
"""PRIMARY KEY (extid, date)"""
""")""")
con.execute("""CREATE TABLE extension ("""
"""extid TEXT,"""
"""date TEXT,"""
"""name TEXT,"""
"""version TEXT,"""
"""description TEXT,"""
"""downloads INTEGER,"""
"""rating REAL,"""
"""ratingcount INTEGER,"""
"""fulldescription TEXT,"""
"""developer TEXT,"""
"""itemcategory TEXT,"""
"""crx_etag TEXT,"""
"""lastupdated TEXT,"""
"""PRIMARY KEY (extid, date),"""
"""FOREIGN KEY (crx_etag) REFERENCES crx(crx_etag)"""
""")""")


def get_etag(ext_id, datepath, con, verbose, indent):
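    """Determine the ETag of the crx downloaded on this date.

    Tries, in order: a *.crx.etag file, the ETag entry of a *.crx.headers
    file, and, if only a *.crx.link file exists, the etag recorded in the
    database for the linked date. Returns an (etag, log) tuple; etag is
    None if nothing was found."""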
txt = ""
    # Try to read the etag from the *.crx.etag file
etagpath = next(
iter(glob.glob(os.path.join(datepath, "*.crx.etag"))), None)
if etagpath:
with open(etagpath) as f:
return f.read(), txt
    # Try to extract the etag from the *.crx.headers file
headerpath = next(
iter(glob.glob(os.path.join(datepath, "*.crx.headers"))), None)
if headerpath:
with open(headerpath) as f:
content = f.read()
            try:
                # The headers file contains a Python dict literal; parse it
                # safely rather than calling eval on file contents.
                headers = ast.literal_eval(content)
                if "ETag" in headers:
                    return headers["ETag"], txt
            except Exception:
                txt = logmsg(
                    verbose, txt,
                    indent + "* WARNING: could not parse crx header file")
    # Try to look up the etag of the linked date in the database
linkpath = next(
iter(glob.glob(os.path.join(datepath, "*.crx.link"))), None)
if linkpath:
with open(linkpath) as f:
link = f.read()
linked_date = link[3:].split("/")[0]
row = next(
con.execute(
"SELECT crx_etag FROM extension WHERE extid=? AND date=?",
(ext_id, linked_date)), None)
if row:
return row[0], txt
    return None, txt


def get_overview_status(datepath):
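    """Return the HTTP status recorded in overview.html.status, or None if
    the file does not exist."""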
overviewstatuspath = os.path.join(datepath, "overview.html.status")
if os.path.exists(overviewstatuspath):
with open(overviewstatuspath) as f:
            return int(f.read())


def get_crx_status(datepath):
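    """Return the HTTP status of the crx download, falling back to
    main.status or default.crx.status (present for paid extensions), or
    None if no status file exists."""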
statuspath = next(
iter(glob.glob(os.path.join(datepath, "*.crx.status"))), None)
if statuspath:
with open(statuspath) as f:
return int(f.read())
    # If the extension is paid, we will find a main.status file...
statuspath = os.path.join(datepath, "main.status")
if os.path.exists(statuspath):
with open(statuspath) as f:
return int(f.read())
    # ... or a default.crx.status file
statuspath = os.path.join(datepath, "default.crx.status")
if os.path.exists(statuspath):
with open(statuspath) as f:
            return int(f.read())


def parse_and_insert_overview(ext_id, date, datepath, con, verbose, indent):
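    """Extract the name, version, rating, categories, download count,
    descriptions, developer, item category, and last-update date from
    overview.html and insert them into the extension and category tables."""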
txt = ""
overview_path = os.path.join(datepath, "overview.html")
if os.path.exists(overview_path):
with open(overview_path) as overview_file:
contents = overview_file.read()
            # Extract the extension name
            match = re.search(r"""<meta itemprop="name" content="(.*?)"\s*/>""",
                              contents)
            name = match.group(1) if match else None

            # Extract the extension version
            match = re.search(
                r"""<meta itemprop="version" content="(.*?)"\s*/>""", contents)
            version = match.group(1) if match else None

            # Extract the rating and the rating count
            match = re.search(
                r"""<meta itemprop="ratingValue" content="(.*?)"\s*/>""",
                contents)
            rating = float(match.group(1)) if match else None
            match = re.search(
                r"""<meta itemprop="ratingCount" content="(.*?)"\s*/>""",
                contents)
            rating_count = int(match.group(1)) if match else None

            # Extract the extension categories
            match = re.search(
                """Attribute name="category">(.+?)</Attribute>""", contents)
            categories = match.group(1).split(",") if match else None

            # Extract the number of downloads
            match = re.search(
                r"""<meta itemprop="interactionCount" content="UserDownloads:((?:\d|,)+)""",
                contents)
            downloads = int(match.group(1).replace(",", "")) if match else None

            # Extract the full extension description as it appears on the
            # overview page
            doc = BeautifulSoup(contents, 'html.parser')
description_parent = doc.find('div', itemprop="description")
description = str(
description_parent.contents[0]
) if description_parent and description_parent.contents else None
full_description = str(
description_parent.parent) if description_parent else None
developer_parent = doc.find(
class_=lambda cls: cls and "e-f-Me" in cls)
developer = "".join([str(x) for x in developer_parent.contents
]) if developer_parent else None
last_updated_parent = doc.find(
class_=lambda cls: cls and "h-C-b-p-D-xh-hh" in cls)
last_updated = str(last_updated_parent.contents[
0]) if last_updated_parent else None
etag, etag_msg = get_etag(ext_id, datepath, con, verbose, indent)
txt = logmsg(verbose, txt, etag_msg)
match = re.search(
"""<Attribute name="item_category">(.*?)</Attribute>""",
contents)
itemcategory = match.group(1) if match else None
            con.execute(
                "INSERT INTO extension VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)",
                (ext_id, date, name, version, description, downloads, rating,
                 rating_count, full_description, developer, itemcategory, etag,
                 last_updated))
if categories:
for category in categories:
con.execute("INSERT INTO category VALUES (?,?,?)",
(ext_id, date, category))
    return txt


def parse_and_insert_crx(ext_id, date, datepath, con, verbose, indent):
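    """Parse the downloaded crx: insert its manifest permissions and
    content-script URL patterns, its file size, a beautified JavaScript
    line count (jsloc), and its public key into the permission,
    content_script_url, and crx tables."""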
txt = ""
crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
if crx_path:
filename = os.path.basename(crx_path)
with ZipFile(crx_path) as f:
etag, etag_msg = get_etag(ext_id, datepath, con, verbose, indent)
txt = logmsg(verbose, txt, etag_msg)
with f.open("manifest.json") as m:
raw_content = m.read()
# There are some manifests that seem to have weird encodings...
try:
content = raw_content.decode("utf-8-sig")
except UnicodeDecodeError:
# Trying a different encoding, manifests are weird...
content = raw_content.decode("latin1")
                # Attempt to remove JavaScript-style comments from the JSON
                comment_regex = re.compile(r'\s*//.*')
                multiline_comment_regex = re.compile(r'\s*/\*.*\*/')
                lines = content.splitlines()
                for index, line in enumerate(lines):
                    if comment_regex.match(
                            line) or multiline_comment_regex.match(line):
                        lines[index] = ""
                content = "\n".join(lines)
manifest = json.loads(content, strict=False)
if "permissions" in manifest:
for permission in manifest["permissions"]:
con.execute(
"INSERT OR REPLACE INTO permission VALUES (?,?)",
(etag, str(permission)))
if "content_scripts" in manifest:
for csd in manifest["content_scripts"]:
if "matches" in csd:
for urlpattern in csd["matches"]:
con.execute(
"INSERT OR REPLACE INTO content_script_url VALUES (?,?)",
(etag, str(urlpattern)))
size = os.path.getsize(crx_path)
jsloc = 0
jsfiles = filter(lambda x: x.filename.endswith(".js"),
f.infolist())
for jsfile in jsfiles:
with f.open(jsfile) as jsf:
content = jsf.read().decode(errors="surrogateescape")
beautified = jsbeautifier.beautify(content)
lines = beautified.splitlines()
jsloc += len(lines)
public_key = read_crx(crx_path).pk
con.execute("INSERT INTO crx VALUES (?,?,?,?,?)",
(etag, filename, size, jsloc, public_key))
    return txt


def get(d, k):
    """Return d[k] if d is not None and contains k, else None."""
    return d[k] if d and k in d else None


def parse_and_insert_review(ext_id, date, reviewpath, con):
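    """Parse a reviews*.text file and insert each review into the review
    table."""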
    with open(reviewpath) as f:
        content = f.read()
        # Skip any leading junk before the first JSON object.
        stripped = content[content.find('{"'):]
        # raw_decode returns an (object, end-index) pair.
        obj, _ = json.JSONDecoder().raw_decode(stripped)
        annotations = get(obj, "annotations")
        if annotations:
            for review in annotations:
timestamp = get(review, "timestamp")
starRating = get(review, "starRating")
comment = get(review, "comment")
displayname = get(get(review, "entity"), "displayName")
author = get(get(review, "entity"), "author")
language = get(review, "language")
shortauthor = get(get(review, "entity"), "shortAuthor")
con.execute("INSERT INTO review VALUES(?,?,?,?,?,?,?,?,?)",
(author, ext_id, date, displayname, timestamp,
                             starRating, language, shortauthor, comment))


def parse_and_insert_review_replies(ext_id, date, reviewsrepliespaths, con):
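    """Parse the reviewsreplies.text file and insert each reply into the
    reviewreplies table."""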
with open(reviewsrepliespaths) as f:
d = json.load(f)
for result in d["searchResults"]:
for annotation in result["annotations"]:
timestamp = get(annotation, "timestamp")
replyto = get(
get(get(annotation, "entity"), "annotation"), "author")
comment = get(annotation, "comment")
displayname = get(get(annotation, "entity"), "displayName")
author = get(get(annotation, "entity"), "author")
language = get(annotation, "language")
shortauthor = get(get(annotation, "entity"), "shortAuthor")
con.execute(
"INSERT INTO reviewreplies VALUES(?,?,?,?,?,?,?,?,?)",
(author, ext_id, date, displayname, timestamp, replyto,
                     language, shortauthor, comment))


def parse_and_insert_status(ext_id, date, datepath, con):
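    """Record the crawl status for one (extid, date): the crx and overview
    HTTP statuses and any recorded overview exception."""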
overview_status = get_overview_status(datepath)
crx_status = get_crx_status(datepath)
overviewexceptionpath = os.path.join(datepath, "overview.html.exception")
overview_exception = None
if os.path.exists(overviewexceptionpath):
with open(overviewexceptionpath) as f:
overview_exception = f.read()
con.execute("INSERT INTO status VALUES (?,?,?,?,?)",
(ext_id, date, crx_status, overview_status,
                 overview_exception))


def update_sqlite_incremental(db_path, tmptardir, ext_id, date, verbose,
                              indent):
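    """Update the SQLite database at db_path with the data crawled for
    ext_id on the given date (found under tmptardir), creating the schema
    first if the database does not exist yet. Returns the accumulated log
    text."""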
txt = ""
indent2 = indent + 4 * " "
datepath = os.path.join(tmptardir, date)
    txt = logmsg(verbose, txt,
                 indent + "- updating with data from {}\n".format(date))
if not os.path.exists(db_path):
txt = logmsg(verbose, txt,
indent2 + "* db file does not exist, creating...\n")
with SelfclosingSqliteDB(db_path) as con:
setup_tables(con)
with SelfclosingSqliteDB(db_path) as con:
parse_and_insert_status(ext_id, date, datepath, con)
parse_and_insert_overview(ext_id, date, datepath, con, verbose,
indent2)
etag, etag_msg = get_etag(ext_id, datepath, con, verbose, indent2)
txt = logmsg(verbose, txt, etag_msg)
etag_already_in_db = next(
con.execute("SELECT COUNT(crx_etag) FROM crx WHERE crx_etag=?", (
etag, )))[0]
if etag:
if not etag_already_in_db:
try:
crx_msg = parse_and_insert_crx(ext_id, date, datepath, con,
verbose, indent)
txt = logmsg(verbose, txt, crx_msg)
                except BadZipFile as e:
txt = logmsg(
verbose, txt, indent2 +
"* WARNING: the found crx file is not a zip file, exception: "
)
txt = logmsg(verbose, txt, str(e))
txt = logmsg(verbose, txt, "\n")
else:
crx_status = get_crx_status(datepath)
            if crx_status not in (401, 204, 404):
txt = logmsg(verbose, txt,
indent2 + "* WARNING: could not find etag\n")
reviewpaths = glob.glob(os.path.join(datepath, "reviews*-*.text"))
for reviewpath in reviewpaths:
try:
parse_and_insert_review(ext_id, date, reviewpath, con)
except json.decoder.JSONDecodeError as e:
txt = logmsg(
verbose, txt,
indent2 + "* Could not parse review file, exception: ")
txt = logmsg(verbose, txt, str(e))
txt = logmsg(verbose, txt, "\n")
reviewsrepliespaths = os.path.join(datepath, "reviewsreplies.text")
if os.path.exists(reviewsrepliespaths):
try:
parse_and_insert_review_replies(ext_id, date,
reviewsrepliespaths, con)
except json.decoder.JSONDecodeError as e:
txt = logmsg(
verbose, txt, indent2 +
"* Could not parse review reply file, exception: ")
txt = logmsg(verbose, txt, str(e))
txt = logmsg(verbose, txt, "\n")
return txt
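

# A minimal usage sketch; the database path, temporary archive directory,
# extension id, and date below are illustrative, assuming the per-date
# archive layout <tmptardir>/<date>/... produced by the crawler:
#
#     log = update_sqlite_incremental(
#         "/path/to/extensions.sqlite",
#         "/tmp/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
#         "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
#         "2017-06-22",
#         True,
#         "")
#     print(log)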