#!/usr/bin/env python3
|
|
|
|
#
|
|
|
|
# Copyright (C) 2016,2017 The University of Sheffield, UK
|
|
|
|
#
|
|
|
|
# This program is free software: you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
#
|
|
|
|
|
|
|
|
import glob
import os
import re
import sys
from datetime import datetime, timezone

import requests

import ExtensionCrawler.archive
import ExtensionCrawler.config
import ExtensionCrawler.discover
|
2017-01-23 18:54:32 +00:00
|
|
|
|
2017-01-28 00:21:37 +00:00
|
|
|
class RequestResult:
    """Outcome of a single HTTP request.

    Records the HTTP status code (when a response was received) and/or
    the exception that aborted the request.
    """

    def __init__(self, response=None, exception=None):
        # http_status stays None when the request died before any response
        # arrived (e.g. a connection error); previously the attribute was
        # left unset in that case, risking an AttributeError later.
        self.http_status = None
        if response is not None:
            self.http_status = response.status_code
        self.exception = exception

    def is_ok(self):
        """True iff the request raised no exception and returned HTTP 200."""
        # Use `==`, not `is`: identity against int literals only happens to
        # work for CPython's cached small ints, so e.g. `status is 404`
        # was unreliably False.
        return (self.exception is None) and (self.http_status == 200)

    def not_authorized(self):
        """True iff the server answered 401 (Unauthorized)."""
        return (self.exception is None) and (self.http_status == 401)

    def not_found(self):
        """True iff the server answered 404 (Not Found)."""
        return (self.exception is None) and (self.http_status == 404)

    def has_exception(self):
        """True iff the request raised an exception."""
        return self.exception is not None

    def not_available(self):
        """True iff the server answered 503 (Service Unavailable)."""
        return (self.exception is None) and (self.http_status == 503)
|
|
|
|
|
2017-01-28 00:21:37 +00:00
|
|
|
class UpdateResult:
    """Aggregated outcome of updating one extension.

    Bundles the per-request results for the overview page, the crx
    archive and, optionally, the review and support forum pages.
    """

    def __init__(self, id, res_overview, res_crx, res_reviews, res_support):
        self.id = id
        self.res_overview = res_overview
        self.res_crx = res_crx
        self.res_reviews = res_reviews  # None when forums were not crawled
        self.res_support = res_support  # None when forums were not crawled

    def _performed(self):
        # Every request result that was actually obtained (forum results
        # are only present when forums were crawled).
        candidates = (self.res_overview, self.res_crx,
                      self.res_reviews, self.res_support)
        return [res for res in candidates if res is not None]

    def is_ok(self):
        """True iff every performed request succeeded."""
        return all(res.is_ok() for res in self._performed())

    def not_authorized(self):
        """True iff any performed request was answered with 401."""
        return any(res.not_authorized() for res in self._performed())

    def not_in_store(self):
        """True iff any performed request was answered with 404."""
        return any(res.not_found() for res in self._performed())

    def has_exception(self):
        """True iff any performed request raised an exception."""
        return any(res.has_exception() for res in self._performed())

    def raised_google_ddos(self):
        """True iff a forum request was throttled with 503 by Google."""
        forum_results = (self.res_reviews, self.res_support)
        return any(res.not_available()
                   for res in forum_results if res is not None)
|
|
|
|
|
|
|
|
|
|
|
|
|
2017-01-27 23:16:48 +00:00
|
|
|
def write_text(dir, fname, text):
    """Write *text* into the file *fname* located in directory *dir*."""
    target = os.path.join(dir, fname)
    with open(target, 'w') as handle:
        handle.write(text)
|
|
|
|
|
|
|
|
def store_request_metadata(dir, fname, request):
    """Store headers, status code and final URL of *request* in sidecar
    files next to *fname*."""
    sidecars = ((".headers", request.headers),
                (".status", request.status_code),
                (".url", request.url))
    for suffix, value in sidecars:
        write_text(dir, fname + suffix, str(value))
|
|
|
|
|
|
|
|
def store_request_text(dir, fname, request):
    """Store the body of *request* under *fname*, plus metadata sidecars."""
    write_text(dir, fname, request.text)
    store_request_metadata(dir, fname, request)
|
|
|
|
|
|
|
|
def log(verbose, msg):
    """Write *msg* to stdout (no newline appended) when *verbose* is set."""
    if not verbose:
        return
    sys.stdout.write(msg)
|
|
|
|
|
|
|
|
def update_overview(dir, verbose, ext_id):
    """Download the store overview page of *ext_id* into *dir*.

    Returns a RequestResult; on failure the exception text is stored in
    overview.html.exception.
    """
    log(verbose, " * overview page: ")
    # Pre-bind res: if requests.get itself raises, the except path would
    # otherwise hit a NameError instead of reporting the real error.
    res = None
    try:
        res = requests.get(ExtensionCrawler.config.const_overview_url(ext_id))
        log(verbose, "{}".format(str(res.status_code)))
        store_request_text(dir, 'overview.html', res)
    except Exception as e:
        log(verbose, " / Exception: {}\n".format(str(e)))
        write_text(dir, 'overview.html.exception', str(e))
        return RequestResult(res, e)
    log(verbose, "\n")
    return RequestResult(res)
|
2017-01-23 18:54:32 +00:00
|
|
|
|
2017-01-24 22:04:34 +00:00
|
|
|
|
2017-01-23 18:54:32 +00:00
|
|
|
def update_crx(dir, verbose, ext_id):
    """Download the crx archive of *ext_id* into *dir*, streamed in chunks.

    Returns a RequestResult; on failure the exception text is stored in a
    ".exception" sidecar file.
    """
    log(verbose, " * crx archive: ")
    # Pre-bind both names: if the request (or redirect) fails early, the
    # except path previously raised NameError on `res`/`extfilename`.
    res = None
    extfilename = "extension.crx"  # fallback until the real name is known
    try:
        res = requests.get(
            ExtensionCrawler.config.const_download_url().format(ext_id),
            stream=True)
        log(verbose, "{}".format(str(res.status_code)))

        # The store redirects to a versioned file name; keep it locally.
        extfilename = os.path.basename(res.url)
        store_request_metadata(dir, extfilename, res)

        with open(os.path.join(dir, extfilename), 'wb') as f:
            for chunk in res.iter_content(chunk_size=512 * 1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    except Exception as e:
        log(verbose, " / Exception: {}\n".format(str(e)))
        write_text(dir, extfilename + ".exception", str(e))
        return RequestResult(res, e)
    log(verbose, "\n")
    return RequestResult(res)
|
2017-01-27 22:31:21 +00:00
|
|
|
|
2017-01-23 18:54:32 +00:00
|
|
|
|
2017-01-24 22:04:34 +00:00
|
|
|
|
2017-01-23 18:54:32 +00:00
|
|
|
def update_reviews(dir, verbose, ext_id):
    """Download the first two pages (100 reviews each) of *ext_id*'s
    reviews into *dir*.

    Returns a RequestResult for the last request performed; on failure the
    exception text is stored in reviews.html.exception.
    """
    log(verbose, " * review page: ")
    # Pre-bind res so a failure of the first post does not trigger a
    # NameError in the except path.
    res = None
    try:
        res = requests.post(
            ExtensionCrawler.config.const_review_url(),
            data=ExtensionCrawler.config.const_review_payload(ext_id, "0", "100"))
        log(verbose, "{}/".format(str(res.status_code)))
        store_request_text(dir, 'reviews000-099.text', res)
        # Second page starts at offset 100 (mirrors update_support); the
        # old code re-posted offset "0" and stored page one twice.
        res = requests.post(
            ExtensionCrawler.config.const_review_url(),
            data=ExtensionCrawler.config.const_review_payload(ext_id, "100",
                                                              "100"))
        log(verbose, "{}".format(str(res.status_code)))
        store_request_text(dir, 'reviews100-199.text', res)
    except Exception as e:
        log(verbose, " / Exception: {}\n".format(str(e)))
        write_text(dir, 'reviews.html.exception', str(e))
        return RequestResult(res, e)
    log(verbose, "\n")
    return RequestResult(res)
|
2017-01-23 18:54:32 +00:00
|
|
|
|
2017-01-24 22:04:34 +00:00
|
|
|
|
2017-01-23 18:54:32 +00:00
|
|
|
def update_support(dir, verbose, ext_id):
    """Download the first two pages (100 entries each) of *ext_id*'s
    support forum into *dir*.

    Returns a RequestResult for the last request performed; on failure the
    exception text is stored in support.html.exception.
    """
    log(verbose, " * support page: ")
    # Pre-bind res so a failure of the first post does not trigger a
    # NameError in the except path.
    res = None
    try:
        res = requests.post(
            ExtensionCrawler.config.const_support_url(),
            data=ExtensionCrawler.config.const_support_payload(ext_id, "0", "100"))
        log(verbose, "{}/".format(str(res.status_code)))
        store_request_text(dir, 'support000-099.text', res)
        res = requests.post(
            ExtensionCrawler.config.const_support_url(),
            data=ExtensionCrawler.config.const_support_payload(ext_id, "100",
                                                               "100"))
        log(verbose, "{}".format(str(res.status_code)))
        store_request_text(dir, 'support100-199.text', res)
    except Exception as e:
        log(verbose, " / Exception: {}\n".format(str(e)))
        write_text(dir, 'support.html.exception', str(e))
        return RequestResult(res, e)
    log(verbose, "\n")
    return RequestResult(res)
|
2017-01-23 18:54:32 +00:00
|
|
|
|
2017-01-24 22:04:34 +00:00
|
|
|
|
2017-01-25 19:43:10 +00:00
|
|
|
def update_extension(archivedir, verbose, forums, ext_id):
    """Fetch one snapshot of extension *ext_id* (overview page, crx
    archive and, when *forums* is set, review/support pages) into a fresh
    timestamp-named directory and return an UpdateResult."""
    log(verbose, " Updating {}".format(ext_id))
    if forums:
        log(verbose, " (including forums)")
    log(verbose, "\n")
    # Snapshot layout: <archivedir>/<sharded id dir>/<ISO UTC timestamp>
    date = datetime.now(timezone.utc).isoformat()
    snapshot_dir = os.path.join(
        archivedir,
        ExtensionCrawler.archive.get_local_archive_dir(ext_id),
        date)
    os.makedirs(snapshot_dir, exist_ok=True)
    res_overview = update_overview(snapshot_dir, verbose, ext_id)
    res_crx = update_crx(snapshot_dir, verbose, ext_id)
    res_reviews = update_reviews(snapshot_dir, verbose, ext_id) if forums else None
    res_support = update_support(snapshot_dir, verbose, ext_id) if forums else None
    return UpdateResult(ext_id, res_overview, res_crx, res_reviews, res_support)
|
2017-01-23 18:54:32 +00:00
|
|
|
|
2017-01-27 22:31:21 +00:00
|
|
|
def update_extensions(archivedir, verbose, forums_ext_ids, known_ext_ids,
                      new_ext_ids):
    """Update every known and newly discovered extension.

    Forum pages are fetched only for ids contained in *forums_ext_ids*.
    Returns the list of UpdateResult objects, one per extension.
    """
    ext_ids = known_ext_ids + new_ext_ids
    log(verbose,
        "Updating {} extensions ({} new, {} including forums)\n".format(
            len(ext_ids), len(new_ext_ids), len(forums_ext_ids)))
    return [
        update_extension(archivedir, verbose, ext_id in forums_ext_ids, ext_id)
        for ext_id in ext_ids
    ]
|
2017-01-23 18:54:32 +00:00
|
|
|
|
2017-01-24 22:04:34 +00:00
|
|
|
|
2017-01-25 19:43:10 +00:00
|
|
|
def get_existing_ids(archivedir, verbose):
    """Return the ids of all extensions already stored in *archivedir*.

    The archive layout is <archivedir>/<shard>/<ext-id>, where an
    extension id is exactly 32 characters drawn from [0-9a-z].
    """
    ext_id_pattern = '[0-9a-z]' * 32
    # os.path.basename replaces re.sub("^.*\/", "", d): the "\/" escape is
    # invalid in a non-raw string (DeprecationWarning) and the regex broke
    # on Windows path separators.
    return [os.path.basename(path)
            for path in glob.glob(
                os.path.join(archivedir, "*", ext_id_pattern))]
|
2017-01-23 18:54:32 +00:00
|
|
|
|
|
|
|
|
2017-01-24 22:04:34 +00:00
|
|
|
def get_forum_ext_ids(confdir, verbose):
    """Read the extension ids whose forum pages should be crawled from
    <confdir>/forums.conf, one id per line (whitespace stripped)."""
    conf_path = os.path.join(confdir, "forums.conf")
    with open(conf_path) as conf_file:
        return [line.strip() for line in conf_file]
|
2017-01-23 18:54:32 +00:00
|
|
|
|
2017-01-26 03:53:16 +00:00
|
|
|
|
|
|
|
def get_new_ids(verbose, known_ids):
    """Crawl the store for extension ids and return those not in *known_ids*."""
    log(verbose, "Discovering new ids ... \n")
    discovered_ids = ExtensionCrawler.discover.crawl_nearly_all_of_ext_ids()
    # Set difference drops duplicates as well as already-known ids.
    new_ids = list(set(discovered_ids) - set(known_ids))
    log(verbose, " Discovered {} new extensions (out of {})\n".format(
        len(new_ids), len(discovered_ids)))
    return new_ids
|
2017-01-23 18:54:32 +00:00
|
|
|
|
2017-01-28 00:21:37 +00:00
|
|
|
def log_summary(verbose, res):
    """Log aggregate statistics over *res*, a list of UpdateResult objects."""
    total = len(res)
    success = sum(1 for x in res if x.is_ok())
    not_authorized = sum(1 for x in res if x.not_authorized())
    has_exception = sum(1 for x in res if x.has_exception())
    raised_ddos = sum(1 for x in res if x.raised_google_ddos())
    not_in_store = sum(1 for x in res if x.not_in_store())
    log(verbose, "\n")
    log(verbose, "Summary:\n")
    log(verbose, " Updated {} out of {} extensions successfully\n".format(
        str(success), str(total)))
    log(verbose, " Not authorized: {}\n".format(str(not_authorized)))
    log(verbose, " Raised Google DDOS: {}\n".format(str(raised_ddos)))
    # Typo fix in user-facing message: "Extenion" -> "Extension".
    log(verbose, " Extension not in store: {}\n".format(str(not_in_store)))
    log(verbose, " Unknown exception: {}\n".format(str(has_exception)))
|
2017-01-28 00:21:37 +00:00
|
|
|
|
2017-01-26 03:53:16 +00:00
|
|
|
|
2017-01-23 18:54:32 +00:00
|
|
|
def main():
    """Crawl the Chrome extension store into ./archive.

    Reads forum ids from ./conf/forums.conf, updates every extension
    already in the archive (plus newly discovered ones unless discovery
    is skipped) and logs a summary.
    """
    basedir = "."
    archive_dir = os.path.join(basedir, "archive")
    conf_dir = os.path.join(basedir, "conf")
    verbose = True
    skip_discovery = True

    log(verbose, "Configuration:\n")
    log(verbose, " Base dir: {}\n".format(basedir))
    log(verbose, " Archive dir: {}\n".format(archive_dir))
    log(verbose, " Conf. dir: {}\n".format(conf_dir))
    log(verbose, " Skip discovery: {}\n".format(skip_discovery))
    # Fixed: the old code called "\n".format(skip_discovery) — the format
    # argument was dead weight since the string has no placeholder.
    log(verbose, "\n")

    forum_ext_ids = get_forum_ext_ids(conf_dir, verbose)
    existing_ids = get_existing_ids(archive_dir, verbose)
    known_ids = list(set(existing_ids) | set(forum_ext_ids))
    new_ids = []
    if not skip_discovery:
        new_ids = get_new_ids(verbose, known_ids)

    res = update_extensions(archive_dir, verbose, forum_ext_ids, existing_ids,
                            new_ids)
    log_summary(verbose, res)
|
2017-01-23 18:54:32 +00:00
|
|
|
|
|
|
|
# Guard the entry point so importing this module does not start a crawl.
if __name__ == "__main__":
    main()
|