#!/usr/bin/env python3
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
"""
Module for handling archives of the Browser Extension Crawler.
"""
import os
import glob
import re
import json
from multiprocessing import Pool
from functools import partial
import shutil
import tempfile
import time
import traceback
import tarfile
import datetime
import dateutil
import dateutil.parser
import requests
from ExtensionCrawler.config import (const_review_payload, const_review_search_url,
const_download_url, get_local_archive_dir,
const_overview_url, const_support_url,
const_support_payload, const_review_search_payload,
const_review_url)
from ExtensionCrawler.util import logmsg, google_dos_protection, log, value_of
from ExtensionCrawler.sqlite import db_file, update_sqlite_incremental


class Error(Exception):
pass
class CrawlError(Error):
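    """Raised when a crawled resource (e.g. a downloaded CRX) fails validation."""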
def __init__(self, extid, message, pagecontent=""):
self.extid = extid
self.message = message
self.pagecontent = pagecontent
super(CrawlError, self).__init__()
class RequestResult:
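    """Outcome of a single HTTP request: the status code and/or the raised exception."""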
    def __init__(self, response=None, exception=None):
        self.http_status = None
        if response is not None:
            self.http_status = response.status_code
        self.exception = exception

    def is_ok(self):
        return (self.exception is None) and (self.http_status == 200)

    def not_authorized(self):
        return (self.exception is None) and (self.http_status == 401)

    def not_found(self):
        return (self.exception is None) and (self.http_status == 404)

    def has_exception(self):
        return self.exception is not None

    def not_available(self):
        return (self.exception is None) and (self.http_status == 503)

    def not_modified(self):
        return (self.exception is None) and (self.http_status == 304)
class UpdateResult:
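    """Aggregated result of updating one extension: per-resource request results plus tar/sqlite status."""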
    def __init__(self, id, is_new, exception, res_overview, res_crx,
                 res_reviews, res_support, res_sql, sql_update):
        self.id = id
        self.new = is_new
        self.exception = exception
        self.res_overview = res_overview
        self.res_crx = res_crx
        self.res_reviews = res_reviews
        self.res_support = res_support
        self.res_sql = res_sql
        self.sql_update = sql_update

    def is_new(self):
        return self.new

    def is_ok(self):
        return (self.res_overview.is_ok() and
                (self.res_crx.is_ok() or self.res_crx.not_modified()) and
                ((self.res_reviews is None) or self.res_reviews.is_ok()) and (
                    (self.res_support is None) or self.res_support.is_ok()))

    def not_authorized(self):
        return (self.res_overview.not_authorized() or
                self.res_crx.not_authorized() or
                (self.res_reviews is not None and
                 self.res_reviews.not_authorized()) or (
                     self.res_support is not None and
                     self.res_support.not_authorized()))

    def not_in_store(self):
        return (
            self.res_overview.not_found() or self.res_crx.not_found() or
            (self.res_reviews is not None and self.res_reviews.not_found()) or
            (self.res_support is not None and self.res_support.not_found()))

    def has_exception(self):
        return (self.res_overview.has_exception() or
                self.res_crx.has_exception() or
                (self.res_reviews is not None and
                 self.res_reviews.has_exception()) or (
                     self.res_support is not None and
                     self.res_support.has_exception()))

    def raised_google_ddos(self):
        return ((self.res_reviews is not None and
                 self.res_reviews.not_available()) or
                (self.res_support is not None and
                 self.res_support.not_available()))

    def not_modified(self):
        return self.res_crx.not_modified()

    def corrupt_tar(self):
        return self.exception is not None

    def sql_exception(self):
        return self.res_sql is not None

    def sql_success(self):
        return self.sql_update
def write_text(tardir, date, fname, text):
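    """Write text to <tardir>/<date>/<fname>, creating the directory if necessary."""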
directory = os.path.join(tardir, date)
os.makedirs(directory, exist_ok=True)
with open(os.path.join(directory, fname), 'w') as f:
f.write(text)
def store_request_metadata(tar, date, fname, request):
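    """Store the headers, status code, and final URL of a response next to its payload."""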
write_text(tar, date, fname + ".headers", str(request.headers))
write_text(tar, date, fname + ".status", str(request.status_code))
write_text(tar, date, fname + ".url", str(request.url))
def store_request_text(tar, date, fname, request):
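    """Store the body of a response together with its request metadata."""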
write_text(tar, date, fname, request.text)
store_request_metadata(tar, date, fname, request)
def httpdate(dt):
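    """Format a datetime as an RFC 1123 HTTP-date (e.g. for If-Modified-Since headers)."""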
weekday = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()]
month = [
"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct",
"Nov", "Dec"
][dt.month - 1]
return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
weekday, dt.day, month, dt.year, dt.hour, dt.minute, dt.second)
def last_modified_utc_date(path):
    if path == "":
        return ""
    return os.path.split(os.path.dirname(path))[1]


def last_modified_http_date(path):
    if path == "":
        return ""
    return httpdate(dateutil.parser.parse(last_modified_utc_date(path)))
def last_crx(archivedir, extid, date=None):
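    """Return the path (inside the extension's tar archive) of the most recent
    CRX, optionally restricted to entries dated no later than date."""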
last_crx = ""
2017-03-16 08:30:33 +00:00
tar = os.path.join(archivedir, get_local_archive_dir(extid),
extid + ".tar")
if os.path.exists(tar):
2017-04-13 08:34:33 +00:00
t = tarfile.open(tar, 'r')
2017-07-05 07:21:40 +00:00
old_crxs = sorted([
x.name for x in t.getmembers()
if x.name.endswith(".crx") and x.size > 0 and (date is None or (
dateutil.parser.parse(
os.path.split(os.path.split(x.name)[0])[1]) <= date))
2017-07-05 07:21:40 +00:00
])
t.close()
if old_crxs != []:
last_crx = old_crxs[-1]
2017-04-13 08:34:33 +00:00
return last_crx
def last_etag(archivedir, extid, crxfile):
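    """Return the ETag recorded for the given CRX file in the extension's tar
    archive, or "" if it cannot be recovered. The .headers file is a str()'d
    dict (see store_request_metadata), hence the eval() below."""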
etag = ""
tar = os.path.join(archivedir, get_local_archive_dir(extid),
extid + ".tar")
try:
if os.path.exists(tar):
t = tarfile.open(tar, 'r')
headers = eval((t.extractfile(crxfile + ".headers")).read())
etag = headers['ETag']
t.close()
except Exception:
return ""
return etag
def update_overview(tar, date, verbose, ext_id):
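    """Fetch the extension's overview page and store it (plus request metadata) under the given date."""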
    logtxt = logmsg(verbose, "", " * overview page: ")
    res = None
    try:
        res = requests.get(const_overview_url(ext_id), timeout=10)
        logtxt = logmsg(verbose, logtxt, "{}".format(str(res.status_code)))
        store_request_text(tar, date, 'overview.html', res)
    except Exception as e:
        logtxt = logmsg(verbose, logtxt, " / Exception: {}\n".format(str(e)))
        write_text(tar, date, 'overview.html.exception',
                   traceback.format_exc())
        return RequestResult(res, e), logtxt
    logtxt = logmsg(verbose, logtxt, "\n")
    return RequestResult(res), logtxt
def validate_crx_response(res, extid, extfilename):
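    """Raise CrawlError if the response is not a CRX (wrong Content-Type) or the file name is unexpected."""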
regex_extfilename = re.compile(r'^extension[_0-9]+\.crx$')
    if 'Content-Type' not in res.headers:
        raise CrawlError(extid, 'Did not find Content-Type header.',
                         '\n'.join([line.decode('utf-8') for line in res.iter_lines()]))
if not res.headers['Content-Type'] == 'application/x-chrome-extension':
text = [line.decode('utf-8') for line in res.iter_lines()]
raise CrawlError(
extid,
'Expected Content-Type header to be application/x-chrome-extension, but got {}.'.
format(res.headers['Content-Type']), '\n'.join(text))
if not regex_extfilename.match(extfilename):
raise CrawlError(
extid, '{} is not a valid extension file name, skipping...'.format(
extfilename))
def update_crx(archivedir, tmptardir, verbose, ext_id, date):
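    """Download the extension's CRX into tmptardir/<date>. Uses If-Modified-Since
    and an ETag comparison to avoid re-downloading unchanged archives; when the
    previous CRX is still current, a .link file pointing to it is written instead."""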
    res = None
    extfilename = "default_ext_archive.crx"
    last_crx_file = last_crx(archivedir, ext_id)
    last_crx_etag = last_etag(archivedir, ext_id, last_crx_file)
    last_crx_http_date = last_modified_http_date(last_crx_file)
    logtxt = logmsg(verbose, "",
                    " * crx archive (Last: {}): ".format(
                        value_of(last_crx_http_date, "n/a")))
    headers = ""
    if last_crx_file != "":
        headers = {'If-Modified-Since': last_crx_http_date}
    try:
        res = requests.get(const_download_url().format(ext_id),
                           stream=True,
                           headers=headers,
                           timeout=10)
        logtxt = logmsg(verbose, logtxt, "{}\n".format(str(res.status_code)))
        extfilename = os.path.basename(res.url)
        if re.search('&', extfilename):
            extfilename = "default.crx"
        if res.status_code == 304:
            etag = requests.head(
                const_download_url().format(ext_id),
                timeout=10,
                allow_redirects=True).headers.get('ETag')
            write_text(tmptardir, date, extfilename + ".etag", etag)
            logtxt = logmsg(verbose, logtxt, (
                " - checking etag, last: {}\n" +
                " current: {}\n").format(last_crx_etag, etag))
            if (etag != "") and (etag != last_crx_etag):
                logtxt = logmsg(
                    verbose, logtxt,
                    " - downloading due to different etags\n")
                res = requests.get(const_download_url().format(ext_id),
                                   stream=True,
                                   timeout=10)
            else:
                write_text(tmptardir, date, extfilename + ".link",
                           os.path.join("..",
                                        last_modified_utc_date(last_crx_file),
                                        extfilename) + "\n")
        store_request_metadata(tmptardir, date, extfilename, res)
        if res.status_code == 200:
            validate_crx_response(res, ext_id, extfilename)
            with open(os.path.join(tmptardir, date, extfilename), 'wb') as f:
                for chunk in res.iter_content(chunk_size=512 * 1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
            write_text(tmptardir, date, extfilename + ".etag",
                       res.headers.get("ETag"))
    except Exception as e:
        logtxt = logmsg(verbose, logtxt,
                        " - Exception: {}\n".format(str(e)))
        write_text(tmptardir, date, extfilename + ".exception",
                   traceback.format_exc())
        return RequestResult(res, e), logtxt
    logtxt = logmsg(verbose, logtxt, "\n")
    return RequestResult(res), logtxt
def iterate_authors(pages):
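    """Yield (author, groups) for every review/support annotation that has a reply."""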
for page in pages:
json_page = json.loads(page[page.index("{\""):page.rindex("}}},") + 1])
for annotation in json_page["annotations"]:
if "attributes" in annotation and "replyExists" in annotation[
"attributes"] and annotation["attributes"]["replyExists"]:
yield (annotation["entity"]["author"],
annotation["entity"]["groups"])
def update_reviews(tar, date, verbose, ext_id):
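    """Fetch the first 200 reviews and any replies to them, storing the raw responses."""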
    logtxt = logmsg(verbose, "", " * review page: ")
    res = None
    try:
        pages = []

        google_dos_protection()
        res = requests.post(
            const_review_url(),
            data=const_review_payload(ext_id, "0", "100"),
            timeout=10)
        logtxt = logmsg(verbose, logtxt, "{}/".format(str(res.status_code)))
        store_request_text(tar, date, 'reviews000-099.text', res)
        pages += [res.text]

        google_dos_protection()
        res = requests.post(
            const_review_url(),
            data=const_review_payload(ext_id, "100", "100"),
            timeout=10)
        logtxt = logmsg(verbose, logtxt, "{}/".format(str(res.status_code)))
        store_request_text(tar, date, 'reviews100-199.text', res)
        pages += [res.text]

        google_dos_protection()
        # Always start with reply number 0 and request 10 replies
        ext_id_author_tups = [(ext_id, author, 0, 10, groups)
                              for author, groups in iterate_authors(pages)]
        if ext_id_author_tups:
            res = requests.post(
                const_review_search_url(),
                data=const_review_search_payload(ext_id_author_tups),
                timeout=10)
            logtxt = logmsg(verbose, logtxt, "{}".format(str(res.status_code)))
            store_request_text(tar, date, 'reviewsreplies.text', res)
        else:
            logtxt = logmsg(verbose, logtxt, "-")
    except Exception as e:
        logtxt = logmsg(verbose, logtxt, " / Exception: {}\n".format(str(e)))
        write_text(tar, date, 'reviews.html.exception', traceback.format_exc())
        return RequestResult(res, e), logtxt
    logtxt = logmsg(verbose, logtxt, "\n")
    return RequestResult(res), logtxt
def update_support(tar, date, verbose, ext_id):
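    """Fetch the first 200 support threads and any replies to them, storing the raw responses."""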
    logtxt = logmsg(verbose, "", " * support page: ")
    res = None
    try:
        pages = []

        google_dos_protection()
        res = requests.post(
            const_support_url(),
            data=const_support_payload(ext_id, "0", "100"),
            timeout=10)
        logtxt = logmsg(verbose, logtxt, "{}/".format(str(res.status_code)))
        store_request_text(tar, date, 'support000-099.text', res)
        pages += [res.text]

        google_dos_protection()
        res = requests.post(
            const_support_url(),
            data=const_support_payload(ext_id, "100", "100"),
            timeout=10)
        logtxt = logmsg(verbose, logtxt, "{}/".format(str(res.status_code)))
        store_request_text(tar, date, 'support100-199.text', res)
        pages += [res.text]

        google_dos_protection()
        # Always start with reply number 0 and request 10 replies
        ext_id_author_tups = [(ext_id, author, 0, 10, groups)
                              for author, groups in iterate_authors(pages)]
        if ext_id_author_tups:
            res = requests.post(
                const_review_search_url(),
                data=const_review_search_payload(ext_id_author_tups),
                timeout=10)
            logtxt = logmsg(verbose, logtxt, "{}".format(str(res.status_code)))
            store_request_text(tar, date, 'supportreplies.text', res)
        else:
            logtxt = logmsg(verbose, logtxt, "-")
    except Exception as e:
        logtxt = logmsg(verbose, logtxt, " / Exception: {}\n".format(str(e)))
        write_text(tar, date, 'support.html.exception', traceback.format_exc())
        return RequestResult(res, e), logtxt
    logtxt = logmsg(verbose, logtxt, "\n")
    return RequestResult(res), logtxt
def update_extension(archivedir, verbose, forums, ext_id):
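    """Update one extension: crawl its pages into a temporary directory, append
    the results to its tar archive, and update the sqlite database."""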
    logtxt = logmsg(verbose, "", " Updating extension {}".format(ext_id))
    is_new = False
    tar_exception = None
    sql_exception = None
    sql_success = False
    tmptardir = ""
    start = time.time()

    if forums:
        logtxt = logmsg(verbose, logtxt, " (including forums)")
    logtxt = logmsg(verbose, logtxt, "\n")

    date = datetime.datetime.now(datetime.timezone.utc).isoformat()
    tardir = os.path.join(archivedir, get_local_archive_dir(ext_id), ext_id)
    tar = (tardir + ".tar")

    try:
        tmpdir = tempfile.mkdtemp()
        tmptardir = os.path.join(tmpdir, ext_id)
        logtxt = logmsg(verbose, logtxt,
                        " * tmptardir = {}\n".format(tmptardir))
        os.makedirs(
            os.path.join(archivedir, get_local_archive_dir(ext_id)),
            exist_ok=True)
    except Exception as e:
        logtxt = logmsg(verbose, logtxt,
                        " * FATAL: cannot create tmpdir")
        logtxt = logmsg(verbose, logtxt, " / Exception: {}\n".format(str(e)))
        tar_exception = e
        logtxt = logmsg(
            verbose,
            logtxt,
            " * Duration: {}\n".format(
                datetime.timedelta(seconds=int(time.time() - start))))
        log(verbose, logtxt)
        return UpdateResult(ext_id, is_new, tar_exception, None,
                            None, None, None, sql_exception,
                            False)

    res_overview, msg_overview = update_overview(tmptardir, date, verbose,
                                                 ext_id)

    res_reviews = None
    msg_reviews = ""
    res_support = None
    msg_support = ""
    if forums:
        res_reviews, msg_reviews = update_reviews(tmptardir, date, verbose,
                                                  ext_id)

    res_crx, msg_crx = update_crx(archivedir, tmptardir, verbose, ext_id, date)

    if forums:
        res_support, msg_support = update_support(tmptardir, date, verbose,
                                                  ext_id)

    logtxt = logtxt + msg_overview + msg_crx + msg_reviews + msg_support
backup = False
if backup:
try:
os.sync()
            if os.path.exists(tardir + ".bak.tar"):
shutil.move(tardir + ".bak.tar",
tardir + ".bak." + date + ".tar")
os.remove(tardir + ".bak." + date + ".tar")
except Exception:
pass
try:
if os.path.exists(tar):
shutil.copyfile(tar, tardir + ".bak.tar")
except Exception as e:
logtxt = logmsg(
verbose, logtxt,
" * FATAL: cannot rename old tar archive")
logtxt = logmsg(verbose, logtxt,
" / Exception: {}\n".format(str(e)))
tar_exception = e
try:
write_text(tardir, date, ext_id + ".tar.rename.exception",
traceback.format_exc())
except Exception:
pass
if not os.path.exists(tar):
is_new = True
try:
ar = tarfile.open(tar, mode='a:')
ar.add(tmptardir, arcname=ext_id)
ar.close()
except Exception as e:
logtxt = logmsg(verbose, logtxt,
" * FATAL: cannot create tar archive")
logtxt = logmsg(verbose, logtxt, " / Exception: {}\n".format(str(e)))
tar_exception = e
try:
write_text(tardir, date, ext_id + ".tar.create.exception",
traceback.format_exc())
except Exception:
pass
    try:
        logtxt = logmsg(verbose, logtxt, " * Updating db...\n")
        db_path = db_file(archivedir, ext_id)
        msg_updatesqlite = update_sqlite_incremental(
            db_path, tmptardir, ext_id, date, verbose, 15 * " ")
        logtxt = logmsg(verbose, logtxt, msg_updatesqlite)
        sql_success = True
    except Exception as e:
        logtxt = logmsg(verbose, logtxt,
                        " * Exception during update of sqlite db ")
        logtxt = logmsg(verbose, logtxt, " / Exception: {}\n".format(str(e)))
        sql_exception = e
        try:
            write_text(tardir, date, ext_id + ".sql.exception",
                       traceback.format_exc())
        except Exception:
            pass
try:
shutil.rmtree(path=tmpdir)
except Exception as e:
logtxt = logmsg(verbose, logtxt,
" * FATAL: cannot remove archive directory")
logtxt = logmsg(verbose, logtxt, " / Exception: {}\n".format(str(e)))
tar_exception = e
try:
write_text(tardir, date, ext_id + ".dir.remove.exception",
traceback.format_exc())
except Exception:
pass
logtxt = logmsg(
verbose,
logtxt,
" * Duration: {}\n".format(
datetime.timedelta(seconds=int(time.time() - start))))
log(verbose, logtxt)
return UpdateResult(ext_id, is_new, tar_exception, res_overview, res_crx,
res_reviews, res_support, sql_exception, sql_success)
def update_extensions(archivedir, verbose, parallel, forums_ext_ids, ext_ids):
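    """Update all given extensions: ids with forums sequentially (rate-limited),
    the remaining ids in a multiprocessing pool of size parallel."""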
    ext_with_forums = []
    ext_without_forums = []
    ext_ids = list(set(ext_ids) - set(forums_ext_ids))
    forums_ext_ids = list(set(forums_ext_ids))
    log(verbose, "Updating {} extensions ({} including forums)\n".format(
        len(ext_ids), len(forums_ext_ids)))

    # First, update extensions with forums sequentially (and with delays) to
    # avoid running into Google's DDoS detection.
    log(verbose,
        " Updating {} extensions including forums (sequentially)\n".format(
            len(forums_ext_ids)))
    ext_with_forums = list(
        map(
            partial(update_extension, archivedir, verbose, True),
            forums_ext_ids))

    # Second, update extensions without forums in parallel to increase speed.
    parallel_ids = list(set(ext_ids) - set(forums_ext_ids))
    log(verbose,
        " Updating {} extensions excluding forums (parallel)\n".format(
            len(parallel_ids)))
    with Pool(parallel) as p:
        ext_without_forums = list(
            p.map(
                partial(update_extension, archivedir, verbose, False),
                parallel_ids))

    return ext_with_forums + ext_without_forums
def get_existing_ids(archivedir, verbose):
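    """Return the ids of all extensions that already have a tar archive in archivedir."""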
byte = '[0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z]'
word = byte + byte + byte + byte
return list(
map(lambda d: re.sub(".tar$", "", re.sub(r"^.*\/", "", d)),
glob.glob(os.path.join(archivedir, "*", word + ".tar"))))
def get_forum_ext_ids(confdir, verbose):
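    """Read confdir/forums.conf and return the extension ids whose forums should be crawled."""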
with open(os.path.join(confdir, "forums.conf")) as f:
ids = f.readlines()
r = re.compile('^[a-p]+$')
ids = [x.strip() for x in ids]
return list(filter(r.match, ids))