Refactoring: clean-up of imports and a few other minor improvements.
This commit is contained in:
parent
eb0054b47d
commit
5268f2a732
|
@ -13,43 +13,47 @@
|
||||||
# GNU General Public License for more details.
|
# GNU General Public License for more details.
|
||||||
#
|
#
|
||||||
# You should have received a copy of the GNU General Public License
|
# You should have received a copy of the GNU General Public License
|
||||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
#
|
#
|
||||||
|
|
||||||
|
"""
|
||||||
|
Module for handling archives of the Browser Extension Crawler.
|
||||||
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
|
||||||
import glob
|
import glob
|
||||||
import re
|
import re
|
||||||
import requests
|
import json
|
||||||
from time import sleep
|
|
||||||
from random import randint
|
|
||||||
import datetime
|
|
||||||
from ExtensionCrawler.config import *
|
|
||||||
from ExtensionCrawler.util import *
|
|
||||||
from ExtensionCrawler.archive import archive_file
|
|
||||||
from ExtensionCrawler.sqlite import *
|
|
||||||
import dateutil
|
|
||||||
import dateutil.parser
|
|
||||||
from multiprocessing import Pool
|
from multiprocessing import Pool
|
||||||
from functools import partial
|
from functools import partial
|
||||||
import shutil
|
import shutil
|
||||||
import tarfile
|
|
||||||
import tempfile
|
import tempfile
|
||||||
import time
|
import time
|
||||||
import dateutil.parser
|
|
||||||
import traceback
|
import traceback
|
||||||
|
import tarfile
|
||||||
|
import datetime
|
||||||
|
import dateutil
|
||||||
|
import dateutil.parser
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from ExtensionCrawler.config import (const_review_payload, const_review_search_url,
|
||||||
|
const_download_url, get_local_archive_dir,
|
||||||
|
const_overview_url, const_support_url,
|
||||||
|
const_support_payload, const_review_search_payload,
|
||||||
|
const_review_url)
|
||||||
|
from ExtensionCrawler.util import logmsg, google_dos_protection, log, value_of
|
||||||
|
from ExtensionCrawler.sqlite import db_file, update_sqlite_incremental
|
||||||
|
|
||||||
|
|
||||||
class Error(Exception):
|
class Error(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class CrawlError(Error):
|
class CrawlError(Error):
|
||||||
def __init__(self, extid, message, pagecontent=""):
|
def __init__(self, extid, message, pagecontent=""):
|
||||||
self.extid = extid
|
self.extid = extid
|
||||||
self.message = message
|
self.message = message
|
||||||
self.pagecontent = pagecontent
|
self.pagecontent = pagecontent
|
||||||
|
super(CrawlError, self).__init__()
|
||||||
|
|
||||||
class RequestResult:
|
class RequestResult:
|
||||||
def __init__(self, response=None, exception=None):
|
def __init__(self, response=None, exception=None):
|
||||||
|
@ -73,7 +77,7 @@ class RequestResult:
|
||||||
return (self.exception is None) and (self.http_status == 503)
|
return (self.exception is None) and (self.http_status == 503)
|
||||||
|
|
||||||
def not_modified(self):
|
def not_modified(self):
|
||||||
return ((self.exception is None) and (self.http_status == 304))
|
return (self.exception is None) and (self.http_status == 304)
|
||||||
|
|
||||||
|
|
||||||
class UpdateResult:
|
class UpdateResult:
|
||||||
|
@ -95,16 +99,16 @@ class UpdateResult:
|
||||||
def is_ok(self):
|
def is_ok(self):
|
||||||
return (self.res_overview.is_ok() and
|
return (self.res_overview.is_ok() and
|
||||||
(self.res_crx.is_ok() or self.res_crx.not_modified()) and
|
(self.res_crx.is_ok() or self.res_crx.not_modified()) and
|
||||||
((self.res_reviews is None) or self.res_reviews.is_ok()) and
|
((self.res_reviews is None) or self.res_reviews.is_ok()) and (
|
||||||
((self.res_support is None) or self.res_support.is_ok()))
|
(self.res_support is None) or self.res_support.is_ok()))
|
||||||
|
|
||||||
def not_authorized(self):
|
def not_authorized(self):
|
||||||
return (self.res_overview.not_authorized() or
|
return (self.res_overview.not_authorized() or
|
||||||
self.res_crx.not_authorized() or
|
self.res_crx.not_authorized() or
|
||||||
(self.res_reviews is not None and
|
(self.res_reviews is not None and
|
||||||
self.res_reviews.not_authorized()) or
|
self.res_reviews.not_authorized()) or (
|
||||||
(self.res_support is not None and
|
self.res_support is not None and
|
||||||
self.res_support.not_authorized()))
|
self.res_support.not_authorized()))
|
||||||
|
|
||||||
def not_in_store(self):
|
def not_in_store(self):
|
||||||
return (
|
return (
|
||||||
|
@ -116,9 +120,9 @@ class UpdateResult:
|
||||||
return (self.res_overview.has_exception() or
|
return (self.res_overview.has_exception() or
|
||||||
self.res_crx.has_exception() or
|
self.res_crx.has_exception() or
|
||||||
(self.res_reviews is not None and
|
(self.res_reviews is not None and
|
||||||
self.res_reviews.has_exception()) or
|
self.res_reviews.has_exception()) or (
|
||||||
(self.res_support is not None and
|
self.res_support is not None and
|
||||||
self.res_support.has_exception()))
|
self.res_support.has_exception()))
|
||||||
|
|
||||||
def raised_google_ddos(self):
|
def raised_google_ddos(self):
|
||||||
return ((self.res_reviews is not None and
|
return ((self.res_reviews is not None and
|
||||||
|
@ -140,9 +144,9 @@ class UpdateResult:
|
||||||
|
|
||||||
|
|
||||||
def write_text(tardir, date, fname, text):
|
def write_text(tardir, date, fname, text):
|
||||||
dir = os.path.join(tardir, date)
|
directory = os.path.join(tardir, date)
|
||||||
os.makedirs(dir, exist_ok=True)
|
os.makedirs(directory, exist_ok=True)
|
||||||
with open(os.path.join(dir, fname), 'w') as f:
|
with open(os.path.join(directory, fname), 'w') as f:
|
||||||
f.write(text)
|
f.write(text)
|
||||||
|
|
||||||
|
|
||||||
|
@ -163,9 +167,8 @@ def httpdate(dt):
|
||||||
"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct",
|
"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct",
|
||||||
"Nov", "Dec"
|
"Nov", "Dec"
|
||||||
][dt.month - 1]
|
][dt.month - 1]
|
||||||
return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (weekday, dt.day, month,
|
return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
|
||||||
dt.year, dt.hour,
|
weekday, dt.day, month, dt.year, dt.hour, dt.minute, dt.second)
|
||||||
dt.minute, dt.second)
|
|
||||||
|
|
||||||
|
|
||||||
def last_modified_utc_date(path):
|
def last_modified_utc_date(path):
|
||||||
|
@ -188,9 +191,9 @@ def last_crx(archivedir, extid, date=None):
|
||||||
t = tarfile.open(tar, 'r')
|
t = tarfile.open(tar, 'r')
|
||||||
old_crxs = sorted([
|
old_crxs = sorted([
|
||||||
x.name for x in t.getmembers()
|
x.name for x in t.getmembers()
|
||||||
if x.name.endswith(".crx") and x.size > 0 and
|
if x.name.endswith(".crx") and x.size > 0 and (date is None or (
|
||||||
(date is None or (dateutil.parser.parse(
|
dateutil.parser.parse(
|
||||||
os.path.split(os.path.split(x.name)[0])[1]) <= date))
|
os.path.split(os.path.split(x.name)[0])[1]) <= date))
|
||||||
])
|
])
|
||||||
t.close()
|
t.close()
|
||||||
if old_crxs != []:
|
if old_crxs != []:
|
||||||
|
@ -209,7 +212,7 @@ def last_etag(archivedir, extid, crxfile):
|
||||||
headers = eval((t.extractfile(crxfile + ".headers")).read())
|
headers = eval((t.extractfile(crxfile + ".headers")).read())
|
||||||
etag = headers['ETag']
|
etag = headers['ETag']
|
||||||
t.close()
|
t.close()
|
||||||
except Exception as e:
|
except Exception:
|
||||||
return ""
|
return ""
|
||||||
return etag
|
return etag
|
||||||
|
|
||||||
|
@ -260,11 +263,10 @@ def update_crx(archivedir, tmptardir, verbose, ext_id, date):
|
||||||
if last_crx_file is not "":
|
if last_crx_file is not "":
|
||||||
headers = {'If-Modified-Since': last_crx_http_date}
|
headers = {'If-Modified-Since': last_crx_http_date}
|
||||||
try:
|
try:
|
||||||
res = requests.get(
|
res = requests.get(const_download_url().format(ext_id),
|
||||||
const_download_url().format(ext_id),
|
stream=True,
|
||||||
stream=True,
|
headers=headers,
|
||||||
headers=headers,
|
timeout=10)
|
||||||
timeout=10)
|
|
||||||
logtxt = logmsg(verbose, logtxt, "{}\n".format(str(res.status_code)))
|
logtxt = logmsg(verbose, logtxt, "{}\n".format(str(res.status_code)))
|
||||||
extfilename = os.path.basename(res.url)
|
extfilename = os.path.basename(res.url)
|
||||||
if re.search('&', extfilename):
|
if re.search('&', extfilename):
|
||||||
|
@ -276,21 +278,19 @@ def update_crx(archivedir, tmptardir, verbose, ext_id, date):
|
||||||
timeout=10,
|
timeout=10,
|
||||||
allow_redirects=True).headers.get('ETag')
|
allow_redirects=True).headers.get('ETag')
|
||||||
write_text(tmptardir, date, extfilename + ".etag", etag)
|
write_text(tmptardir, date, extfilename + ".etag", etag)
|
||||||
logtxt = logmsg(
|
logtxt = logmsg(verbose, logtxt, (
|
||||||
verbose, logtxt,
|
" - checking etag, last: {}\n" +
|
||||||
(" - checking etag, last: {}\n" +
|
" current: {}\n").format(
|
||||||
" current: {}\n").format(
|
last_crx_etag, etag))
|
||||||
last_crx_etag, etag))
|
|
||||||
|
|
||||||
if ((etag is not "") and (etag != last_crx_etag)):
|
if (etag is not "") and (etag != last_crx_etag):
|
||||||
logtxt = logmsg(
|
logtxt = logmsg(
|
||||||
verbose, logtxt,
|
verbose, logtxt,
|
||||||
" - downloading due to different etags\n")
|
" - downloading due to different etags\n")
|
||||||
|
|
||||||
res = requests.get(
|
res = requests.get(const_download_url().format(ext_id),
|
||||||
const_download_url().format(ext_id),
|
stream=True,
|
||||||
stream=True,
|
timeout=10)
|
||||||
timeout=10)
|
|
||||||
else:
|
else:
|
||||||
write_text(tmptardir, date, extfilename + ".link",
|
write_text(tmptardir, date, extfilename + ".link",
|
||||||
os.path.join("..",
|
os.path.join("..",
|
||||||
|
@ -319,7 +319,8 @@ def iterate_authors(pages):
|
||||||
for page in pages:
|
for page in pages:
|
||||||
json_page = json.loads(page[page.index("{\""):page.rindex("}}},") + 1])
|
json_page = json.loads(page[page.index("{\""):page.rindex("}}},") + 1])
|
||||||
for annotation in json_page["annotations"]:
|
for annotation in json_page["annotations"]:
|
||||||
if "attributes" in annotation and "replyExists" in annotation["attributes"] and annotation["attributes"]["replyExists"]:
|
if "attributes" in annotation and "replyExists" in annotation[
|
||||||
|
"attributes"] and annotation["attributes"]["replyExists"]:
|
||||||
yield (annotation["entity"]["author"],
|
yield (annotation["entity"]["author"],
|
||||||
annotation["entity"]["groups"])
|
annotation["entity"]["groups"])
|
||||||
|
|
||||||
|
@ -423,7 +424,6 @@ def update_extension(archivedir, verbose, forums, ext_id):
|
||||||
sql_exception = None
|
sql_exception = None
|
||||||
sql_success = False
|
sql_success = False
|
||||||
tmptardir = ""
|
tmptardir = ""
|
||||||
tmptar = ""
|
|
||||||
start = time.time()
|
start = time.time()
|
||||||
|
|
||||||
if forums:
|
if forums:
|
||||||
|
@ -437,7 +437,6 @@ def update_extension(archivedir, verbose, forums, ext_id):
|
||||||
try:
|
try:
|
||||||
tmpdir = tempfile.mkdtemp()
|
tmpdir = tempfile.mkdtemp()
|
||||||
tmptardir = os.path.join(tmpdir, ext_id)
|
tmptardir = os.path.join(tmpdir, ext_id)
|
||||||
tmptar = tmptardir + ".tar"
|
|
||||||
logtxt = logmsg(verbose, logtxt,
|
logtxt = logmsg(verbose, logtxt,
|
||||||
" * tmptardir = {}\n".format(tmptardir))
|
" * tmptardir = {}\n".format(tmptardir))
|
||||||
os.makedirs(
|
os.makedirs(
|
||||||
|
@ -448,11 +447,14 @@ def update_extension(archivedir, verbose, forums, ext_id):
|
||||||
" * FATAL: cannot create tmpdir")
|
" * FATAL: cannot create tmpdir")
|
||||||
logtxt = logmsg(verbose, logtxt, " / Exception: {}\n".format(str(e)))
|
logtxt = logmsg(verbose, logtxt, " / Exception: {}\n".format(str(e)))
|
||||||
tar_exception = e
|
tar_exception = e
|
||||||
logtxt = logmsg(verbose, logtxt,
|
logtxt = logmsg(
|
||||||
" * Duration: {}\n".format(datetime.timedelta(seconds=int(time.time()-start))))
|
verbose,
|
||||||
|
logtxt,
|
||||||
|
" * Duration: {}\n".format(
|
||||||
|
datetime.timedelta(seconds=int(time.time() - start))))
|
||||||
log(verbose, logtxt)
|
log(verbose, logtxt)
|
||||||
return UpdateResult(ext_id, is_new, tar_exception, res_overview,
|
return UpdateResult(ext_id, is_new, tar_exception, None,
|
||||||
res_crx, res_reviews, res_support, sql_exception,
|
None, None, None, sql_exception,
|
||||||
False)
|
False)
|
||||||
|
|
||||||
res_overview, msg_overview = update_overview(tmptardir, date, verbose,
|
res_overview, msg_overview = update_overview(tmptardir, date, verbose,
|
||||||
|
@ -549,8 +551,11 @@ def update_extension(archivedir, verbose, forums, ext_id):
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
logtxt = logmsg(verbose, logtxt,
|
logtxt = logmsg(
|
||||||
" * Duration: {}\n".format(datetime.timedelta(seconds=int(time.time()-start))))
|
verbose,
|
||||||
|
logtxt,
|
||||||
|
" * Duration: {}\n".format(
|
||||||
|
datetime.timedelta(seconds=int(time.time() - start))))
|
||||||
log(verbose, logtxt)
|
log(verbose, logtxt)
|
||||||
return UpdateResult(ext_id, is_new, tar_exception, res_overview, res_crx,
|
return UpdateResult(ext_id, is_new, tar_exception, res_overview, res_crx,
|
||||||
res_reviews, res_support, sql_exception, sql_success)
|
res_reviews, res_support, sql_exception, sql_success)
|
||||||
|
@ -592,7 +597,7 @@ def get_existing_ids(archivedir, verbose):
|
||||||
byte = '[0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z]'
|
byte = '[0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z]'
|
||||||
word = byte + byte + byte + byte
|
word = byte + byte + byte + byte
|
||||||
return list(
|
return list(
|
||||||
map(lambda d: re.sub(".tar$", "", re.sub("^.*\/", "", d)),
|
map(lambda d: re.sub(".tar$", "", re.sub(r"^.*\/", "", d)),
|
||||||
glob.glob(os.path.join(archivedir, "*", word + ".tar"))))
|
glob.glob(os.path.join(archivedir, "*", word + ".tar"))))
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue