#!/usr/bin/env python3.6
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
"""
Module for handling archives of the Browser Extension Crawler.
"""

import os
import glob
import re
import json
from multiprocessing import Pool
from concurrent.futures import TimeoutError
from pebble import ProcessPool, ProcessExpired
from functools import partial
import shutil
import tempfile
import time
import traceback
import tarfile
import datetime
import dateutil
import dateutil.parser
import requests

from ExtensionCrawler.config import (
    const_review_payload, const_review_search_url, const_download_url,
    get_local_archive_dir, const_overview_url, const_support_url,
    const_support_payload, const_review_search_payload, const_review_url)
from ExtensionCrawler.util import google_dos_protection, value_of, log_info, log_warning, log_exception
from ExtensionCrawler.db import update_db_incremental


class Error(Exception):
    pass


class CrawlError(Error):
    def __init__(self, extid="", message="", pagecontent=""):
        self.extid = extid
        self.message = message
        self.pagecontent = pagecontent
        super(CrawlError, self).__init__()


class RequestResult:
    def __init__(self, response=None, exception=None):
        # Default to None so the status checks below do not fail when no
        # response was received at all.
        self.http_status = None
        if response is not None:
            self.http_status = response.status_code
        self.exception = exception

    def is_ok(self):
        return (self.exception is None) and (self.http_status == 200)

    def not_authorized(self):
        return (self.exception is None) and (self.http_status == 401)

    def not_found(self):
        return (self.exception is None) and (self.http_status == 404)

    def has_exception(self):
        return self.exception is not None

    def not_available(self):
        return (self.exception is None) and (self.http_status == 503)

    def not_modified(self):
        return (self.exception is None) and (self.http_status == 304)


class UpdateResult:
    def __init__(self, id, is_new, exception, res_overview, res_crx,
                 res_reviews, res_support, res_sql, sql_update):
        self.id = id
        self.new = is_new
        self.exception = exception
        self.res_overview = res_overview
        self.res_crx = res_crx
        self.res_reviews = res_reviews
        self.res_support = res_support
        self.res_sql = res_sql
        self.sql_update = sql_update

    def is_new(self):
        return self.new

    def is_ok(self):
        return (self.res_overview.is_ok()
                and (self.res_crx.is_ok() or self.res_crx.not_modified())
                and ((self.res_reviews is None) or self.res_reviews.is_ok())
                and ((self.res_support is None) or self.res_support.is_ok()))

    def not_authorized(self):
        return ((self.res_overview is not None and self.res_overview.not_authorized())
                or (self.res_crx is not None and self.res_crx.not_authorized())
                or (self.res_reviews is not None
                    and self.res_reviews.not_authorized())
                or (self.res_support is not None
                    and self.res_support.not_authorized()))

    def not_in_store(self):
        return ((self.res_overview is not None and self.res_overview.not_found())
                or (self.res_crx is not None and self.res_crx.not_found())
                or (self.res_reviews is not None and self.res_reviews.not_found())
                or (self.res_support is not None and self.res_support.not_found()))

    def has_exception(self):
        return ((self.res_overview is not None and self.res_overview.has_exception())
                or (self.res_crx is not None and self.res_crx.has_exception())
                or (self.res_reviews is not None
                    and self.res_reviews.has_exception())
                or (self.res_support is not None
                    and self.res_support.has_exception()))

    def raised_google_ddos(self):
        return ((self.res_reviews is not None
                 and self.res_reviews.not_available())
                or (self.res_support is not None
                    and self.res_support.not_available()))

    def not_modified(self):
        return self.res_crx is None or self.res_crx.not_modified()

    def corrupt_tar(self):
        return self.exception is not None

    def sql_exception(self):
        return self.res_sql is not None

    def sql_success(self):
        return self.sql_update


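# A hedged usage sketch (not part of the original source): callers of
# update_extension() typically inspect the returned UpdateResult roughly
# like this, with a hypothetical archive root and extension id:
#
#     res = update_extension("/srv/archive", False, "a" * 32)
#     if res.is_ok() and not res.not_modified():
#         pass  # a new crx and its metadata were archived
#     elif res.not_in_store():
#         pass  # the extension is gone from the Chrome Web Store
#     elif res.has_exception() or res.sql_exception():
#         pass  # inspect res.exception and the logs
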
def write_text(tardir, date, fname, text):
    directory = os.path.join(tardir, date)
    os.makedirs(directory, exist_ok=True)
    with open(os.path.join(directory, fname), 'w') as f:
        f.write(text)


def store_request_metadata(tar, date, fname, request):
    write_text(tar, date, fname + ".headers", str(request.headers))
    write_text(tar, date, fname + ".status", str(request.status_code))
    write_text(tar, date, fname + ".url", str(request.url))


def store_request_text(tar, date, fname, request):
    write_text(tar, date, fname, request.text)
    store_request_metadata(tar, date, fname, request)


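# Illustration (assumption, not from the original code): for a call such as
# store_request_text(tmptardir, date, "overview.html", res), the helpers
# above leave the following files behind, which later end up in the tar:
#
#     <tmptardir>/<date>/overview.html
#     <tmptardir>/<date>/overview.html.headers
#     <tmptardir>/<date>/overview.html.status
#     <tmptardir>/<date>/overview.html.url
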
def httpdate(dt):
    weekday = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()]
    month = [
        "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct",
        "Nov", "Dec"
    ][dt.month - 1]
    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (weekday, dt.day, month,
                                                    dt.year, dt.hour,
                                                    dt.minute, dt.second)


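# Example (for illustration only): httpdate renders datetimes in the
# RFC 1123 form expected by If-Modified-Since headers, e.g.
#
#     httpdate(datetime.datetime(2017, 1, 28, 12, 52, 18))
#     # -> 'Sat, 28 Jan 2017 12:52:18 GMT'
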
def last_modified_utc_date(path):
    if path == "":
        return ""
    return os.path.split(os.path.dirname(path))[1]


def last_modified_http_date(path):
    if path == "":
        return ""
    return httpdate(dateutil.parser.parse(last_modified_utc_date(path)))


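# Example (hedged): the crawl date of an archived crx is encoded in its
# parent directory name, so for a hypothetical tar member path
#
#     p = "<extid>/2017-01-28T12:52:18+00:00/extension_1_0.crx"
#     last_modified_utc_date(p)   # -> "2017-01-28T12:52:18+00:00"
#     last_modified_http_date(p)  # -> "Sat, 28 Jan 2017 12:52:18 GMT"
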
def last_crx(archivedir, extid, date=None):
    last_crx = ""
    last_crx_etag = ""
    tar = os.path.join(archivedir, get_local_archive_dir(extid),
                       extid + ".tar")
    if os.path.exists(tar):
        with tarfile.open(tar, 'r') as t:
            old_crxs = sorted([
                x.name for x in t.getmembers()
                if x.name.endswith(".crx") and x.size > 0 and (
                    date is None or (dateutil.parser.parse(
                        os.path.split(os.path.split(x.name)[0])[1]) <= date))
            ])
            if old_crxs != []:
                last_crx = old_crxs[-1]
                headers_content = t.extractfile(
                    last_crx + ".headers").read().decode().replace(
                        '"', '\\"').replace("'", '"')
                headers_json = json.loads(headers_content)
                last_crx_etag = headers_json["ETag"]

    return last_crx, last_crx_etag


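# Usage sketch (hedged, hypothetical archive root and id): last_crx returns
# the tar member name of the newest archived crx together with the ETag
# recorded for it:
#
#     name, etag = last_crx("/srv/archive", "a" * 32)
#     # name == "" and etag == "" if no crx has been archived yet
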
def first_crx(archivedir, extid, date=None):
    first_crx = ""
    tar = os.path.join(archivedir, get_local_archive_dir(extid),
                       extid + ".tar")
    if os.path.exists(tar):
        t = tarfile.open(tar, 'r')
        old_crxs = sorted([
            x.name for x in t.getmembers()
            if x.name.endswith(".crx") and x.size > 0 and (
                date is None or (date <= dateutil.parser.parse(
                    os.path.split(os.path.split(x.name)[0])[1])))
        ])
        t.close()
        if old_crxs != []:
            first_crx = old_crxs[0]

    return first_crx


def all_crx(archivedir, extid, date=None):
    tar = os.path.join(archivedir, get_local_archive_dir(extid),
                       extid + ".tar")
    all_crxs = []
    if os.path.exists(tar):
        t = tarfile.open(tar, 'r')
        all_crxs = sorted([
            x.name for x in t.getmembers()
            if x.name.endswith(".crx") and x.size > 0
        ])
        t.close()
    return all_crxs


def update_overview(tar, date, ext_id):
    res = None
    try:
        res = requests.get(const_overview_url(ext_id), timeout=10)
        log_info("* overview page: {}".format(str(res.status_code)), 2, ext_id)
        store_request_text(tar, date, 'overview.html', res)
    except Exception as e:
        log_exception("Exception when retrieving overview page", 2, ext_id)
        write_text(tar, date, 'overview.html.exception',
                   traceback.format_exc())
        return RequestResult(res, e)
    return RequestResult(res)


def validate_crx_response(res, extid, extfilename):
    regex_extfilename = re.compile(r'^extension[_0-9]+\.crx$')
    if 'Content-Type' not in res.headers:
        text = [line.decode('utf-8') for line in res.iter_lines()]
        raise CrawlError(extid, 'Did not find Content-Type header.',
                         '\n'.join(text))
    if not res.headers['Content-Type'] == 'application/x-chrome-extension':
        text = [line.decode('utf-8') for line in res.iter_lines()]
        raise CrawlError(
            extid,
            'Expected Content-Type header to be application/x-chrome-extension, but got {}.'.
            format(res.headers['Content-Type']), '\n'.join(text))
    if not regex_extfilename.match(extfilename):
        raise CrawlError(
            extid, '{} is not a valid extension file name, skipping...'.format(
                extfilename))


def update_crx(archivedir, tmptardir, ext_id, date):
    res = None
    extfilename = "default_ext_archive.crx"
    last_crx_file, last_crx_etag = last_crx(archivedir, ext_id)
    last_crx_http_date = last_modified_http_date(last_crx_file)
    headers = {}
    if last_crx_file != "":
        headers = {'If-Modified-Since': last_crx_http_date}
    try:
        log_info("* Checking If-Modified-Since")
        res = requests.get(
            const_download_url().format(ext_id),
            stream=True,
            headers=headers,
            timeout=10)
        log_info("* crx archive (Last: {}): {}".format(
            value_of(last_crx_http_date, "n/a"), str(res.status_code)), 2,
                 ext_id)
        extfilename = os.path.basename(res.url)
        if re.search('&', extfilename):
            extfilename = "default.crx"

        if res.status_code == 304:
            # Not modified according to If-Modified-Since; double-check via
            # the ETag before deciding to skip the download.
            etag = requests.head(
                const_download_url().format(ext_id),
                timeout=10,
                allow_redirects=True).headers.get('ETag')
            write_text(tmptardir, date, extfilename + ".etag", etag)
            log_info("- checking etag, last: {}".format(last_crx_etag), 3,
                     ext_id)
            log_info("             current: {}".format(etag), 3, ext_id)

            if (etag != "") and (etag != last_crx_etag):
                log_info("- downloading due to different etags", 3, ext_id)

                res = requests.get(
                    const_download_url().format(ext_id),
                    stream=True,
                    timeout=10)
            else:
                write_text(tmptardir, date, extfilename + ".link",
                           os.path.join("..",
                                        last_modified_utc_date(last_crx_file),
                                        extfilename) + "\n")
        store_request_metadata(tmptardir, date, extfilename, res)

        if res.status_code == 200:
            validate_crx_response(res, ext_id, extfilename)
            with open(os.path.join(tmptardir, date, extfilename), 'wb') as f:
                for chunk in res.iter_content(chunk_size=512 * 1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
            write_text(tmptardir, date, extfilename + ".etag",
                       res.headers.get("ETag"))
    except Exception as e:
        log_exception("Exception when updating crx", 3, ext_id)
        write_text(tmptardir, date, extfilename + ".exception",
                   traceback.format_exc())
        return RequestResult(res, e)
    return RequestResult(res)


def iterate_authors(pages):
    for page in pages:
        json_page = json.loads(page[page.index("{\""):page.rindex("}}},") + 1])
        for annotation in json_page["annotations"]:
            if "attributes" in annotation and "replyExists" in annotation["attributes"] and annotation["attributes"]["replyExists"]:
                yield (annotation["entity"]["author"],
                       annotation["entity"]["groups"])


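# Shape of the data iterate_authors expects (hedged reconstruction from the
# code above, not an official API description): each page embeds a JSON
# object roughly like
#
#     {"annotations": [
#         {"attributes": {"replyExists": true},
#          "entity": {"author": "<author id>", "groups": "<groups>"}}
#     ]}
#
# and only annotations whose reply exists yield an (author, groups) pair.
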
def update_reviews(tar, date, ext_id):
    res = None
    try:
        pages = []

        # google_dos_protection()
        res = requests.post(
            const_review_url(),
            data=const_review_payload(ext_id, "0", "100"),
            timeout=10)
        log_info("* review page 0-100: {}".format(str(res.status_code)), 2,
                 ext_id)
        store_request_text(tar, date, 'reviews000-099.text', res)
        pages += [res.text]

        google_dos_protection()
        res = requests.post(
            const_review_url(),
            data=const_review_payload(ext_id, "100", "100"),
            timeout=10)
        log_info("* review page 100-200: {}".format(str(res.status_code)), 2,
                 ext_id)
        store_request_text(tar, date, 'reviews100-199.text', res)
        pages += [res.text]

        google_dos_protection()
        # Always start with reply number 0 and request 10 replies
        ext_id_author_tups = [(ext_id, author, 0, 10, groups)
                              for author, groups in iterate_authors(pages)]
        if ext_id_author_tups:
            res = requests.post(
                const_review_search_url(),
                data=const_review_search_payload(ext_id_author_tups),
                timeout=10)
            log_info("* review page replies: {}".format(str(res.status_code)),
                     2, ext_id)
            store_request_text(tar, date, 'reviewsreplies.text', res)
    except Exception as e:
        log_exception("Exception when updating reviews", 2, ext_id)
        write_text(tar, date, 'reviews.html.exception', traceback.format_exc())
        return RequestResult(res, e)
    return RequestResult(res)


def update_support(tar, date, ext_id):
    res = None
    try:
        pages = []

        google_dos_protection()
        res = requests.post(
            const_support_url(),
            data=const_support_payload(ext_id, "0", "100"),
            timeout=10)
        log_info("* support page 0-100: {}".format(str(res.status_code)), 2,
                 ext_id)
        store_request_text(tar, date, 'support000-099.text', res)
        pages += [res.text]

        google_dos_protection()
        res = requests.post(
            const_support_url(),
            data=const_support_payload(ext_id, "100", "100"),
            timeout=10)
        log_info("* support page 100-200: {}".format(str(res.status_code)), 2,
                 ext_id)
        store_request_text(tar, date, 'support100-199.text', res)
        pages += [res.text]

        google_dos_protection()
        # Always start with reply number 0 and request 10 replies
        ext_id_author_tups = [(ext_id, author, 0, 10, groups)
                              for author, groups in iterate_authors(pages)]
        if ext_id_author_tups:
            res = requests.post(
                const_review_search_url(),
                data=const_review_search_payload(ext_id_author_tups),
                timeout=10)
            log_info("* support page replies: {}".format(str(res.status_code)),
                     2, ext_id)
            store_request_text(tar, date, 'supportreplies.text', res)
    except Exception as e:
        log_exception("Exception when updating support pages", 2, ext_id)
        write_text(tar, date, 'support.html.exception', traceback.format_exc())
        return RequestResult(res, e)
    return RequestResult(res)


def update_extension(archivedir, forums, ext_id):
    log_info("Updating extension {}".format(" (including forums)"
                                             if forums else ""), 1, ext_id)
    is_new = False
    tar_exception = None
    sql_exception = None
    sql_success = False
    tmptardir = ""
    start = time.time()

    date = datetime.datetime.now(datetime.timezone.utc).isoformat()

    tardir = os.path.join(archivedir, get_local_archive_dir(ext_id), ext_id)
    tar = (tardir + ".tar")

    try:
        tmpdir = tempfile.mkdtemp()
        tmptardir = os.path.join(tmpdir, ext_id)
        log_info("* tmptardir = {}".format(tmptardir), 2, ext_id)
        os.makedirs(
            os.path.join(archivedir, get_local_archive_dir(ext_id)),
            exist_ok=True)
    except Exception as e:
        log_exception("* FATAL: cannot create tmpdir", 3, ext_id)
        tar_exception = e
        return UpdateResult(ext_id, is_new, tar_exception, None, None, None,
                            None, sql_exception, False)

    res_overview = update_overview(tmptardir, date, ext_id)
    res_reviews = None
    res_support = None
    if forums:
        res_reviews = update_reviews(tmptardir, date, ext_id)

    res_crx = update_crx(archivedir, tmptardir, ext_id, date)

    if forums:
        res_support = update_support(tmptardir, date, ext_id)

    backup = False
    if backup:
        try:
            os.sync()
            if os.path.exists(tardir + ".bak.tar"):
                shutil.move(tardir + ".bak.tar",
                            tardir + ".bak." + date + ".tar")
                os.remove(tardir + ".bak." + date + ".tar")
        except Exception:
            pass

        try:
            if os.path.exists(tar):
                shutil.copyfile(tar, tardir + ".bak.tar")
        except Exception as e:
            log_exception("* FATAL: cannot rename old tar archive", 3, ext_id)
            tar_exception = e
            try:
                write_text(tardir, date, ext_id + ".tar.rename.exception",
                           traceback.format_exc())
            except Exception:
                pass

    if not os.path.exists(tar):
        is_new = True
    try:
        with tarfile.open(tar, mode='a:') as ar:
            ar.add(tmptardir, arcname=ext_id)
    except Exception as e:
        log_exception("* FATAL: cannot create tar archive", 3, ext_id)
        tar_exception = e
        try:
            write_text(tardir, date, ext_id + ".tar.create.exception",
                       traceback.format_exc())
        except Exception:
            pass

    try:
        update_db_incremental(tmptardir, ext_id, date)
        sql_success = True
    except Exception as e:
        log_exception("* Exception during update of db", 3, ext_id)
        sql_exception = e

        try:
            write_text(tardir, date, ext_id + ".sql.exception",
                       traceback.format_exc())
        except Exception:
            pass

    try:
        shutil.rmtree(path=tmpdir)
    except Exception as e:
        log_exception("* FATAL: cannot remove archive directory", 3, ext_id)
        tar_exception = e
        try:
            write_text(tardir, date, ext_id + ".dir.remove.exception",
                       traceback.format_exc())
        except Exception:
            pass

    log_info(
        "* Duration: {}".format(
            datetime.timedelta(seconds=int(time.time() - start))),
        2,
        ext_id)
    return UpdateResult(ext_id, is_new, tar_exception, res_overview, res_crx,
                        res_reviews, res_support, sql_exception, sql_success)


def execute_parallel_ProcessPool(archivedir, max_retry, timeout, max_workers, ext_ids, forums):
    results = []
    for n in range(max_retry):
        if n > 0:
            log_info("Attempt ({} out of {}): {} extensions".format(
                n + 1, max_retry, len(ext_timeouts)), 1)
            ext_ids = ext_timeouts

        ext_timeouts = []
        with ProcessPool(max_workers=max_workers, max_tasks=1000) as pool:
            future = pool.map(partial(update_extension, archivedir, forums),
                              ext_ids,
                              timeout=timeout)
            # Wait for the pool to finish before processing the results.
            iterator = future.result()
            ext_timeouts = []
            for ext_id in ext_ids:
                try:
                    results.append(next(iterator))
                except StopIteration:
                    break
                except TimeoutError as error:
                    log_warning("WorkerException: Processing of %s took longer than %d seconds" % (ext_id, error.args[1]))
                    ext_timeouts.append(ext_id)
                except ProcessExpired as error:
                    log_warning("WorkerException: %s (%s). Exit code: %d" % (error, ext_id, error.exitcode))
                    ext_timeouts.append(ext_id)
                except Exception as error:
                    log_warning("WorkerException: Processing %s raised %s" % (ext_id, error))
                    log_warning(error.traceback)  # Python's traceback of remote process
                    ext_timeouts.append(ext_id)

    return results


def execute_parallel_Pool(archivedir, max_retry, timeout, max_workers, ext_ids, forums):
    log_info("Using multiprocessing.Pool: timeout and max_retry are *not* supported")
    with Pool(processes=max_workers, maxtasksperchild=1000) as pool:
        results = pool.map(partial(update_extension, archivedir, forums),
                           ext_ids)
    return list(results)


def update_extensions(archivedir, parallel, forums_ext_ids, ext_ids, timeout, use_process_pool=False):
    ext_with_forums = []
    ext_without_forums = []
    forums_ext_ids = (list(set(forums_ext_ids)))
    if use_process_pool:
        execute_parallel = execute_parallel_ProcessPool
    else:
        execute_parallel = execute_parallel_Pool

    log_info("Updating {} extensions ({} including forums)".format(
        len(ext_ids), len(forums_ext_ids)))

    # First, update all extensions without forums in parallel (increased speed).
    # parallel_ids = list(set(ext_ids) - set(forums_ext_ids))
    parallel_ids = ext_ids
    log_info("Updating {} extensions excluding forums (parallel)".format(
        len(parallel_ids)), 1)
    ext_without_forums = execute_parallel(archivedir, 3, timeout, parallel, parallel_ids, False)

    # Second, update extensions with forums sequentially (and with delays) to
    # avoid running into Google's DDOS detection.
    log_info("Updating {} extensions including forums (sequentially)".format(
        len(forums_ext_ids)), 1)
    ext_with_forums = execute_parallel(archivedir, 3, timeout, 1, forums_ext_ids, True)

    return ext_with_forums + ext_without_forums


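# Usage sketch (hedged, with hypothetical paths and values): a crawler run
# first collects the id lists and then calls
#
#     results = update_extensions("/srv/archive", 24,
#                                 get_forum_ext_ids("/srv/conf"),
#                                 get_existing_ids("/srv/archive"),
#                                 timeout=600)
#
# where 24 is the number of parallel workers and timeout bounds one update.
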
def get_existing_ids(archivedir):
    byte = '[0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z]'
    word = byte + byte + byte + byte
    return list(
        map(lambda d: re.sub(".tar$", "", re.sub(r"^.*\/", "", d)),
            glob.glob(os.path.join(archivedir, "*", word + ".tar"))))


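# Example (hedged): the glob pattern above matches 32-character extension ids
# (lower-case letters and digits), so a hypothetical file
# <archivedir>/<subdir>/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa.tar is reported as
# the id "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa".
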
def get_forum_ext_ids(confdir):
    with open(os.path.join(confdir, "forums.conf")) as f:
        ids = f.readlines()
    r = re.compile('^[a-p]+$')
    ids = [x.strip() for x in ids]
    return list(filter(r.match, ids))