#!/usr/bin/env python3
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
import sys
import glob
import re
import requests
from time import sleep
from random import randint
from datetime import datetime, timezone
from ExtensionCrawler.discover import *
from ExtensionCrawler.archive import *
from ExtensionCrawler.util import *
import dateutil
import dateutil.parser
class Error(Exception):
    """Base class for all errors raised by this crawler."""
    pass


class CrawlError(Error):
    """Error raised while crawling a single extension.

    Attributes:
        extid: id of the extension whose crawl failed.
        message: human-readable description of the failure.
        pagecontent: raw page content associated with the failure ("" if
            none was captured).
    """

    def __init__(self, extid, message, pagecontent=""):
        # Pass the message to Exception so str(e) / tracebacks show it
        # (the original left the base class uninitialized).
        super().__init__(message)
        self.extid = extid
        self.message = message
        self.pagecontent = pagecontent
class RequestResult:
    """Outcome of a single HTTP request.

    Holds the HTTP status code (when a response was received) and/or the
    exception raised while performing the request.  ``http_status`` is
    ``None`` when the request failed before any response arrived.
    """

    def __init__(self, response=None, exception=None):
        # Always initialize http_status: the original only set it when a
        # response was given, so the predicates below could raise
        # AttributeError for exception-only results.
        self.http_status = None
        if response is not None:
            self.http_status = response.status_code
        self.exception = exception

    def is_ok(self):
        """True if the request succeeded with HTTP 200."""
        return (self.exception is None) and (self.http_status == 200)

    def not_authorized(self):
        """True on HTTP 401 (no exception raised)."""
        return (self.exception is None) and (self.http_status == 401)

    def not_found(self):
        """True on HTTP 404 (no exception raised)."""
        return (self.exception is None) and (self.http_status == 404)

    def has_exception(self):
        """True if an exception was raised during the request."""
        return self.exception is not None

    def not_available(self):
        """True on HTTP 503 (used as Google's DoS-protection signal)."""
        return (self.exception is None) and (self.http_status == 503)

    def not_modified(self):
        """True on HTTP 304 (If-Modified-Since hit)."""
        return (self.exception is None) and (self.http_status == 304)
class UpdateResult:
    """Aggregated outcome of updating one extension.

    Combines the RequestResults of the overview page, the crx download,
    and (optionally) the reviews and support forum pages.  The forum
    results are ``None`` when forums were not crawled.
    """

    def __init__(self, id, res_overview, res_crx, res_reviews, res_support):
        self.id = id
        self.res_overview = res_overview
        self.res_crx = res_crx
        self.res_reviews = res_reviews
        self.res_support = res_support

    def _forum_results(self):
        # Only the forum results that were actually fetched.
        return [r for r in (self.res_reviews, self.res_support)
                if r is not None]

    def _all_results(self):
        # Mandatory results first, then whichever forum results exist.
        return [self.res_overview, self.res_crx] + self._forum_results()

    def is_ok(self):
        """True when every performed request succeeded (304 on the crx counts)."""
        crx_ok = self.res_crx.is_ok() or self.res_crx.not_modified()
        forums_ok = all(r.is_ok() for r in self._forum_results())
        return self.res_overview.is_ok() and crx_ok and forums_ok

    def not_authorized(self):
        """True when any performed request got HTTP 401."""
        return any(r.not_authorized() for r in self._all_results())

    def not_in_store(self):
        """True when any performed request got HTTP 404."""
        return any(r.not_found() for r in self._all_results())

    def has_exception(self):
        """True when any performed request raised an exception."""
        return any(r.has_exception() for r in self._all_results())

    def raised_google_ddos(self):
        """True when a forum request hit Google's DoS protection (503)."""
        return any(r.not_available() for r in self._forum_results())

    def not_modified(self):
        """True when the crx archive was unchanged (HTTP 304)."""
        return self.res_crx.not_modified()
def google_dos_protection(max=3):
    """Sleep a random multiple of 0.5s (up to max * 0.5s) between requests
    so that rapid successive hits do not trigger Google's DoS protection.

    NOTE(review): the parameter name shadows the builtin ``max``; kept
    unchanged for interface compatibility.
    """
    delay = 0.5 * randint(1, max)
    sleep(delay)
def log(verbose, msg):
    """Write msg to stdout, but only when verbose output is enabled."""
    if not verbose:
        return
    sys.stdout.write(msg)
def update_overview(dir, verbose, ext_id):
    """Fetch the extension's store overview page and save it into dir.

    Stores the page as overview.html, or the exception text as
    overview.html.exception on failure.  Returns a RequestResult.
    """
    log(verbose, " * overview page: ")
    # Initialize res so the except path cannot hit a NameError when
    # requests.get itself raises (consistent with update_reviews/support).
    res = None
    try:
        res = requests.get(ExtensionCrawler.config.const_overview_url(ext_id))
        log(verbose, "{}".format(str(res.status_code)))
        store_request_text(dir, 'overview.html', res)
    except Exception as e:
        log(verbose, " / Exception: {}\n".format(str(e)))
        write_text(dir, 'overview.html.exception', str(e))
        return RequestResult(res, e)
    log(verbose, "\n")
    return RequestResult(res)
def validate_crx_response(res, extfilename, extid=""):
    """Sanity-check a crx download response.

    Raises CrawlError when the Content-Type header is missing or is not
    application/x-chrome-extension, or when extfilename does not look
    like a crx archive name.

    extid identifies the extension in error reports; it defaults to ""
    so existing two-argument callers keep working (the original body
    referenced an undefined name ``extid`` and raised NameError instead
    of CrawlError).
    """
    regex_extfilename = re.compile(r'^extension[_0-9]+\.crx$')
    if not 'Content-Type' in res.headers:
        # iter_lines() yields bytes; decode before joining into the report
        # (the original joined bytes with a str separator -> TypeError).
        text = [line.decode('utf-8') for line in res.iter_lines()]
        raise CrawlError(extid, 'Did not find Content-Type header.',
                         '\n'.join(text))
    if not res.headers['Content-Type'] == 'application/x-chrome-extension':
        text = [line.decode('utf-8') for line in res.iter_lines()]
        raise CrawlError(
            extid,
            'Expected Content-Type header to be application/x-chrome-extension, but got {}.'.
            format(res.headers['Content-Type']), '\n'.join(text))
    if not regex_extfilename.match(extfilename):
        raise CrawlError(
            extid, '{} is not a valid extension file name, skipping...'.format(
                extfilename))
def valueOf(value, default):
    """Return value unless it is None or the empty string, else default.

    The original used ``is not ""`` — an identity comparison with a
    literal, which is a CPython interning detail and emits a
    SyntaxWarning on modern interpreters; equality is the intended check.
    """
    if value is not None and value != "":
        return value
    else:
        return default
def update_crx(dir, verbose, ext_id):
    """Download the extension's crx archive into dir.

    Sends If-Modified-Since when a previous archive exists; on HTTP 304
    writes a .link file pointing at the previous download instead of the
    archive itself.  Returns a RequestResult.
    """
    # Pre-bind names used in the except path so a failure inside
    # requests.get cannot raise NameError instead of being reported
    # (res and extfilename were unbound in the original's handler).
    res = None
    extfilename = "extension.crx"
    last_crx_file = last_crx(dir, ext_id)
    last_crx_http_date = last_modified_http_date(last_crx_file)
    log(verbose, " * crx archive (Last: {}): ".format(
        valueOf(last_crx_http_date, "n/a")))
    # requests expects a mapping here; the original passed "" when no
    # previous archive existed, and compared with ``is not ""``.
    headers = {}
    if last_crx_file != "":
        headers = {'If-Modified-Since': last_crx_http_date}
    try:
        res = requests.get(
            ExtensionCrawler.config.const_download_url().format(ext_id),
            stream=True,
            headers=headers)
        log(verbose, "{}".format(str(res.status_code)))
        extfilename = os.path.basename(res.url)
        store_request_metadata(dir, extfilename, res)
        if res.status_code == 304:
            # Unchanged since the last crawl: record a link to the
            # previously downloaded archive instead of re-storing it.
            write_text(dir, extfilename + ".link",
                       os.path.join("..",
                                    last_modified_utc_date(last_crx_file),
                                    extfilename) + "\n")
        else:
            validate_crx_response(res, extfilename)
            with open(os.path.join(dir, extfilename), 'wb') as f:
                for chunk in res.iter_content(chunk_size=512 * 1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
    except Exception as e:
        log(verbose, " / Exception: {}\n".format(str(e)))
        write_text(dir, extfilename + ".exception", str(e))
        return RequestResult(res, e)
    log(verbose, "\n")
    return RequestResult(res)
def update_reviews(dir, verbose, ext_id):
    """Fetch and store the first two pages of user reviews (0-99, 100-199).

    Returns a RequestResult for the last request performed.
    """
    log(verbose, " * review page: ")
    res = None
    try:
        google_dos_protection()
        res = requests.post(
            ExtensionCrawler.config.const_review_url(),
            data=ExtensionCrawler.config.const_review_payload(ext_id, "0",
                                                              "100"))
        log(verbose, "{}/".format(str(res.status_code)))
        store_request_text(dir, 'reviews000-099.text', res)
        google_dos_protection()
        # BUG FIX: the second request previously reused offset "0", so
        # reviews100-199.text stored a duplicate of the first page
        # (compare update_support, which correctly uses "100").
        res = requests.post(
            ExtensionCrawler.config.const_review_url(),
            data=ExtensionCrawler.config.const_review_payload(ext_id, "100",
                                                              "100"))
        log(verbose, "{}".format(str(res.status_code)))
        store_request_text(dir, 'reviews100-199.text', res)
    except Exception as e:
        log(verbose, " / Exception: {}\n".format(str(e)))
        write_text(dir, 'reviews.html.exception', str(e))
        return RequestResult(res, e)
    log(verbose, "\n")
    return RequestResult(res)
def update_support(dir, verbose, ext_id):
    """Fetch and store the first two pages of the support forum (0-99,
    100-199).  Returns a RequestResult for the last request performed.
    """
    log(verbose, " * support page: ")
    response = None
    # (offset, target file, status log format) for each forum page.
    pages = (("0", 'support000-099.text', "{}/"),
             ("100", 'support100-199.text', "{}"))
    try:
        for offset, filename, status_fmt in pages:
            google_dos_protection()
            response = requests.post(
                ExtensionCrawler.config.const_support_url(),
                data=ExtensionCrawler.config.const_support_payload(
                    ext_id, offset, "100"))
            log(verbose, status_fmt.format(str(response.status_code)))
            store_request_text(dir, filename, response)
    except Exception as e:
        log(verbose, " / Exception: {}\n".format(str(e)))
        write_text(dir, 'support.html.exception', str(e))
        return RequestResult(response, e)
    log(verbose, "\n")
    return RequestResult(response)
def update_extension(archivedir, verbose, forums, ext_id):
    """Crawl one extension: overview page, crx archive and, when forums
    is true, the review and support forum pages.  Results land in a new
    timestamped directory under the extension's archive path.

    Returns an UpdateResult aggregating the individual RequestResults.
    """
    log(verbose, " Updating {}".format(ext_id))
    if forums:
        log(verbose, " (including forums)")
    log(verbose, "\n")
    # One fresh directory per crawl, named by the current UTC timestamp.
    date = datetime.now(timezone.utc).isoformat()
    ext_dir = os.path.join(
        archivedir,
        ExtensionCrawler.archive.get_local_archive_dir(ext_id),
        date)
    os.makedirs(ext_dir, exist_ok=True)
    res_overview = update_overview(ext_dir, verbose, ext_id)
    res_crx = update_crx(ext_dir, verbose, ext_id)
    res_reviews = update_reviews(ext_dir, verbose, ext_id) if forums else None
    res_support = update_support(ext_dir, verbose, ext_id) if forums else None
    return UpdateResult(ext_id, res_overview, res_crx, res_reviews,
                        res_support)
def update_extensions(archivedir, verbose, forums_ext_ids, known_ext_ids,
                      new_ext_ids):
    """Update every known and newly discovered extension.

    Extensions listed in forums_ext_ids additionally get their review
    and support forum pages crawled.  Returns a list of UpdateResults.
    """
    ext_ids = known_ext_ids + new_ext_ids
    log(verbose,
        "Updating {} extensions ({} new, {} including forums)\n".format(
            len(ext_ids), len(new_ext_ids), len(forums_ext_ids)))
    return [
        update_extension(archivedir, verbose, ext_id in forums_ext_ids,
                         ext_id) for ext_id in ext_ids
    ]
def get_existing_ids(archivedir, verbose):
    """Return the extension ids already present in the archive tree.

    An id is 32 lowercase hex-ish characters ([0-9a-z]); matching
    directories live one level below archivedir.  The original stripped
    the path prefix with re.sub("^.*\\/", ...) — an invalid escape
    sequence (DeprecationWarning) that is also POSIX-separator-only;
    os.path.basename is the portable equivalent.
    """
    ext_id_pattern = '[0-9a-z]' * 32
    return [
        os.path.basename(d)
        for d in glob.glob(os.path.join(archivedir, "*", ext_id_pattern))
    ]
def get_forum_ext_ids(confdir, verbose):
    """Read the extension ids whose forums should be crawled.

    Ids are listed one per line in confdir/forums.conf; surrounding
    whitespace is stripped.
    """
    conf_file = os.path.join(confdir, "forums.conf")
    with open(conf_file) as handle:
        return [line.strip() for line in handle]
def get_new_ids(verbose, known_ids):
    """Discover extension ids in the store and return only unseen ones."""
    log(verbose, "Discovering new ids ... \n")
    discovered_ids = ExtensionCrawler.discover.crawl_nearly_all_of_ext_ids()
    known = set(known_ids)
    new_ids = list(set(discovered_ids) - known)
    log(verbose, " Discovered {} new extensions (out of {})\n".format(
        len(new_ids), len(discovered_ids)))
    return new_ids
def log_summary(verbose, res):
    """Log aggregate statistics over a list of UpdateResult objects."""

    def tally(pred):
        # Number of results satisfying the given predicate.
        return sum(1 for r in res if pred(r))

    total = len(res)
    success = tally(lambda r: r.is_ok())
    not_authorized = tally(lambda r: r.not_authorized())
    has_exception = tally(lambda r: r.has_exception())
    raised_ddos = tally(lambda r: r.raised_google_ddos())
    not_in_store = tally(lambda r: r.not_in_store())
    not_modified = tally(lambda r: r.not_modified())
    log(verbose, "\n")
    log(verbose, "Summary:\n")
    log(verbose, " Updated {} out of {} extensions successfully\n".format(
        str(success), str(total)))
    log(verbose, " Not authorized: {}\n".format(str(not_authorized)))
    log(verbose, " Raised Google DDOS: {}\n".format(str(raised_ddos)))
    log(verbose, " Updated archives: {}\n".format(str(success - not_modified)))
    log(verbose, " Extensions not in store: {}\n".format(str(not_in_store)))
    log(verbose, " Unknown exception: {}\n".format(str(has_exception)))
def main():
    """Entry point: read configuration, collect extension ids, update all.

    Discovery of brand-new ids is skipped by default (skip_discovery).
    """
    basedir = "."
    archive_dir = os.path.join(basedir, "archive")
    conf_dir = os.path.join(basedir, "conf")
    verbose = True
    skip_discovery = True

    log(verbose, "Configuration:\n")
    log(verbose, " Base dir: {}\n".format(basedir))
    log(verbose, " Archive dir: {}\n".format(archive_dir))
    log(verbose, " Conf. dir: {}\n".format(conf_dir))
    log(verbose, " Skip discovery: {}\n".format(skip_discovery))
    log(verbose, "\n")

    forum_ext_ids = get_forum_ext_ids(conf_dir, verbose)
    existing_ids = get_existing_ids(archive_dir, verbose)
    known_ids = list(set(existing_ids) | set(forum_ext_ids))
    new_ids = [] if skip_discovery else get_new_ids(verbose, known_ids)
    res = update_extensions(archive_dir, verbose, forum_ext_ids, existing_ids,
                            new_ids)
    log_summary(verbose, res)
# Only crawl when executed as a script; importing this module should not
# trigger network traffic (the original called main() unconditionally).
if __name__ == "__main__":
    main()