Refactoring.

Achim D. Brucker 2017-01-28 13:12:47 +00:00
parent 23e1147370
commit 3ed43f036d
3 changed files with 190 additions and 181 deletions


@@ -24,7 +24,9 @@ import requests
from time import sleep
from random import randint
from datetime import datetime, timezone
from ExtensionCrawler.config import *
from ExtensionCrawler.util import *
from ExtensionCrawler.archive import *
import dateutil
import dateutil.parser
@@ -161,3 +163,178 @@ def last_crx(dir, extid):
    return last_archive

def update_overview(dir, verbose, ext_id):
    log(verbose, " * overview page: ")
    res = None
    try:
        res = requests.get(const_overview_url(ext_id))
        log(verbose, "{}".format(str(res.status_code)))
        store_request_text(dir, 'overview.html', res)
    except Exception as e:
        log(verbose, " / Exception: {}\n".format(str(e)))
        write_text(dir, 'overview.html.exception', str(e))
        return RequestResult(res, e)
    log(verbose, "\n")
    return RequestResult(res)

def validate_crx_response(res, extid, extfilename):
    regex_extfilename = re.compile(r'^extension[_0-9]+\.crx$')
    if not 'Content-Type' in res.headers:
        raise CrawlError(extid, 'Did not find Content-Type header.',
                         '\n'.join(line.decode('utf-8') for line in res.iter_lines()))
    if not res.headers['Content-Type'] == 'application/x-chrome-extension':
        text = [line.decode('utf-8') for line in res.iter_lines()]
        raise CrawlError(
            extid,
            'Expected Content-Type header to be application/x-chrome-extension, but got {}.'.
            format(res.headers['Content-Type']), '\n'.join(text))
    if not regex_extfilename.match(extfilename):
        raise CrawlError(
            extid, '{} is not a valid extension file name, skipping...'.format(
                extfilename))
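
A quick illustration of the file-name check above; the sample names are made up, but they show what the regex accepts and rejects:

    import re
    regex = re.compile(r'^extension[_0-9]+\.crx$')
    assert regex.match('extension_1_2_3.crx')      # typical Chrome Web Store download name
    assert not regex.match('something_else.crx')   # anything else is treated as a crawl error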

def update_crx(dir, verbose, ext_id):
    last_crx_file = last_crx(dir, ext_id)
    last_crx_http_date = last_modified_http_date(last_crx_file)
    log(verbose, " * crx archive (Last: {}): ".format(
        valueOf(last_crx_http_date, "n/a")))
    res = None
    headers = {}
    if last_crx_file != "":
        headers = {'If-Modified-Since': last_crx_http_date}
    try:
        res = requests.get(
            const_download_url().format(ext_id),
            stream=True,
            headers=headers)
        log(verbose, "{}".format(str(res.status_code)))
        extfilename = os.path.basename(res.url)
        store_request_metadata(dir, extfilename, res)
        if res.status_code == 304:
            write_text(dir, extfilename + ".link",
                       os.path.join("..",
                                    last_modified_utc_date(last_crx_file),
                                    extfilename) + "\n")
        else:
            validate_crx_response(res, ext_id, extfilename)
            with open(os.path.join(dir, extfilename), 'wb') as f:
                for chunk in res.iter_content(chunk_size=512 * 1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
    except Exception as e:
        log(verbose, " / Exception: {}\n".format(str(e)))
        write_text(dir, extfilename + ".exception", str(e))
        return RequestResult(res, e)
    log(verbose, "\n")
    return RequestResult(res)
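
The download path leans on HTTP conditional requests: if an earlier .crx is already archived, its Last-Modified value is replayed as If-Modified-Since, a 304 answer is recorded as a small .link file pointing at the previous snapshot, and only a fresh answer is validated and streamed to disk. A minimal standalone sketch of that pattern with requests (the URL and date below are placeholders, not values from this crawler):

    import requests

    url = 'https://example.com/extension_1_0.crx'      # placeholder download URL
    last_seen = 'Sat, 28 Jan 2017 13:12:47 GMT'        # Last-Modified of the archived copy

    res = requests.get(url, stream=True,
                       headers={'If-Modified-Since': last_seen})
    if res.status_code == 304:
        pass  # unchanged: record a link to the existing snapshot instead of re-downloading
    else:
        with open('extension_1_0.crx', 'wb') as f:
            for chunk in res.iter_content(chunk_size=512 * 1024):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)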

def update_reviews(dir, verbose, ext_id):
    log(verbose, " * review page: ")
    res = None
    try:
        google_dos_protection()
        res = requests.post(
            const_review_url(),
            data=const_review_payload(ext_id, "0", "100"))
        log(verbose, "{}/".format(str(res.status_code)))
        store_request_text(dir, 'reviews000-099.text', res)
        google_dos_protection()
        res = requests.post(
            const_review_url(),
            data=const_review_payload(ext_id, "100", "100"))
        log(verbose, "{}".format(str(res.status_code)))
        store_request_text(dir, 'reviews100-199.text', res)
    except Exception as e:
        log(verbose, " / Exception: {}\n".format(str(e)))
        write_text(dir, 'reviews.html.exception', str(e))
        return RequestResult(res, e)
    log(verbose, "\n")
    return RequestResult(res)

def update_support(dir, verbose, ext_id):
    log(verbose, " * support page: ")
    res = None
    try:
        google_dos_protection()
        res = requests.post(
            const_support_url(),
            data=const_support_payload(ext_id, "0", "100"))
        log(verbose, "{}/".format(str(res.status_code)))
        store_request_text(dir, 'support000-099.text', res)
        google_dos_protection()
        res = requests.post(
            const_support_url(),
            data=const_support_payload(ext_id, "100", "100"))
        log(verbose, "{}".format(str(res.status_code)))
        store_request_text(dir, 'support100-199.text', res)
    except Exception as e:
        log(verbose, " / Exception: {}\n".format(str(e)))
        write_text(dir, 'support.html.exception', str(e))
        return RequestResult(res, e)
    log(verbose, "\n")
    return RequestResult(res)
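
Both update_reviews and update_support fetch the first two blocks of 100 entries, apparently passing an offset and a page size to the payload helpers (const_review_payload and const_support_payload come from ExtensionCrawler.config; their exact wire format is not shown in this diff). A generic sketch of that offset-based paging, with a caller-supplied payload builder standing in for those helpers:

    import requests

    def fetch_pages(url, make_payload, pages=2, page_size=100):
        # fetch entries 0-99, 100-199, ... as separate POST requests
        chunks = []
        for i in range(pages):
            offset = str(i * page_size)
            res = requests.post(url, data=make_payload(offset, str(page_size)))
            chunks.append(res.text)
        return chunks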

def update_extension(archivedir, verbose, forums, ext_id):
    log(verbose, " Updating {}".format(ext_id))
    if forums:
        log(verbose, " (including forums)")
    log(verbose, "\n")
    date = datetime.now(timezone.utc).isoformat()
    dir = os.path.join(
        os.path.join(archivedir, get_local_archive_dir(ext_id)), date)
    os.makedirs(dir, exist_ok=True)
    res_overview = update_overview(dir, verbose, ext_id)
    res_crx = update_crx(dir, verbose, ext_id)
    res_reviews = None
    res_support = None
    if forums:
        res_reviews = update_reviews(dir, verbose, ext_id)
        res_support = update_support(dir, verbose, ext_id)
    return UpdateResult(ext_id, res_overview, res_crx, res_reviews,
                        res_support)
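
Each call to update_extension therefore creates a fresh snapshot directory named by the current UTC time in ISO-8601 form, nested under a per-extension directory computed by get_local_archive_dir (that helper is defined elsewhere in ExtensionCrawler.archive, so its exact layout is an assumption here). A minimal sketch of the path construction with placeholder values:

    import os
    from datetime import datetime, timezone

    archivedir = 'archive'                   # placeholder archive root
    local_dir = 'a' * 32                     # stand-in for get_local_archive_dir(ext_id)
    snapshot = os.path.join(archivedir, local_dir,
                            datetime.now(timezone.utc).isoformat())
    os.makedirs(snapshot, exist_ok=True)     # e.g. archive/<ext dir>/2017-01-28T13:12:47+00:00/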

def update_extensions(archivedir, verbose, forums_ext_ids, known_ext_ids,
                      new_ext_ids):
    def update_forums(ext_id):
        return (ext_id in forums_ext_ids)
    ext_ids = known_ext_ids + new_ext_ids
    log(verbose,
        "Updating {} extensions ({} new, {} including forums)\n".format(
            len(ext_ids), len(new_ext_ids), len(forums_ext_ids)))
    return list(map(lambda ext_id: update_extension(archivedir, verbose,
                                                    update_forums(ext_id), ext_id),
                    ext_ids))

def get_existing_ids(archivedir, verbose):
    byte = '[0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z]'
    word = byte + byte + byte + byte
    return list(
        map(lambda d: re.sub("^.*\/", "", d),
            glob.glob(os.path.join(archivedir, "*", word))))
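
The glob fragment above matches directory names that are exactly 32 characters from [0-9a-z], i.e. the shape of a Chrome extension id (Chrome itself only uses the letters a-p, so the pattern is slightly broader than strictly necessary). A one-line check of the equivalent regular expression, using a made-up id:

    import re
    assert re.fullmatch(r'[0-9a-z]{32}', 'a' * 32)   # same shape the glob accepts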

def get_forum_ext_ids(confdir, verbose):
    with open(os.path.join(confdir, "forums.conf")) as f:
        ids = f.readlines()
    ids = [x.strip() for x in ids]
    return ids
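
Taken together, these helpers form the whole update pipeline. A minimal sketch of a driver, assuming an existing archive directory and a configuration directory containing forums.conf (the directory names are placeholders; get_new_ids is added alongside the discovery code further down in this commit, and log_summary stays in the crawler script):

    archivedir = 'archive'
    confdir = 'conf'
    verbose = True

    known_ids = get_existing_ids(archivedir, verbose)
    forum_ids = get_forum_ext_ids(confdir, verbose)
    new_ids = get_new_ids(verbose, known_ids)
    results = update_extensions(archivedir, verbose, forum_ids, known_ids, new_ids)
    log_summary(verbose, results)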


@@ -22,6 +22,8 @@ import requests
import re
from functools import reduce
import ExtensionCrawler.config
from ExtensionCrawler.util import *

def crawl_nearly_all_of_ext_ids():
    def get_inner_elems(doc):
@@ -45,3 +47,12 @@ def crawl_nearly_all_of_ext_ids():
        lambda x, y: x + y,
        map(lambda s: [elem.text for elem in get_inner_elems(s)], shards), [])
    return [re.search("[a-z]{32}", url).group(0) for url in overview_urls]

def get_new_ids(verbose, known_ids):
    log(verbose, "Discovering new ids ... \n")
    discovered_ids = ExtensionCrawler.discover.crawl_nearly_all_of_ext_ids()
    new_ids = list(set(discovered_ids) - set(known_ids))
    log(verbose, " Discovered {} new extensions (out of {})\n".format(
        len(new_ids), len(discovered_ids)))
    return new_ids

crawler

@@ -27,190 +27,11 @@ from datetime import datetime, timezone
from ExtensionCrawler.discover import *
from ExtensionCrawler.archive import *
from ExtensionCrawler.util import *
from ExtensionCrawler.discover import *
import dateutil
import dateutil.parser

def update_overview(dir, verbose, ext_id):
    log(verbose, " * overview page: ")
    try:
        res = requests.get(ExtensionCrawler.config.const_overview_url(ext_id))
        log(verbose, "{}".format(str(res.status_code)))
        store_request_text(dir, 'overview.html', res)
    except Exception as e:
        log(verbose, " / Exception: {}\n".format(str(e)))
        write_text(dir, 'overview.html.exception', str(e))
        return RequestResult(res, e)
    log(verbose, "\n")
    return RequestResult(res)

def validate_crx_response(res, extfilename):
    regex_extfilename = re.compile(r'^extension[_0-9]+\.crx$')
    if not 'Content-Type' in res.headers:
        raise CrawlError(extid, 'Did not find Content-Type header.',
                         '\n'.join(res.iter_lines()))
    if not res.headers['Content-Type'] == 'application/x-chrome-extension':
        text = [line.decode('utf-8') for line in res.iter_lines()]
        raise CrawlError(
            extid,
            'Expected Content-Type header to be application/x-chrome-extension, but got {}.'.
            format(res.headers['Content-Type']), '\n'.join(text))
    if not regex_extfilename.match(extfilename):
        raise CrawlError(
            extid, '{} is not a valid extension file name, skipping...'.format(
                extfilename))

def update_crx(dir, verbose, ext_id):
    last_crx_file = last_crx(dir, ext_id)
    last_crx_http_date = last_modified_http_date(last_crx_file)
    log(verbose, " * crx archive (Last: {}): ".format(
        valueOf(last_crx_http_date, "n/a")))
    headers = ""
    if last_crx_file is not "":
        headers = {'If-Modified-Since': last_crx_http_date}
    try:
        res = requests.get(
            ExtensionCrawler.config.const_download_url().format(ext_id),
            stream=True,
            headers=headers)
        log(verbose, "{}".format(str(res.status_code)))
        extfilename = os.path.basename(res.url)
        store_request_metadata(dir, extfilename, res)
        if res.status_code == 304:
            write_text(dir, extfilename + ".link",
                       os.path.join("..",
                                    last_modified_utc_date(last_crx_file),
                                    extfilename) + "\n")
        else:
            validate_crx_response(res, extfilename)
            with open(os.path.join(dir, extfilename), 'wb') as f:
                for chunk in res.iter_content(chunk_size=512 * 1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
    except Exception as e:
        log(verbose, " / Exception: {}\n".format(str(e)))
        write_text(dir, extfilename + ".exception", str(e))
        return RequestResult(res, e)
    log(verbose, "\n")
    return RequestResult(res)

def update_reviews(dir, verbose, ext_id):
    log(verbose, " * review page: ")
    res = None
    try:
        google_dos_protection()
        res = requests.post(
            ExtensionCrawler.config.const_review_url(),
            data=ExtensionCrawler.config.const_review_payload(ext_id, "0",
                                                              "100"))
        log(verbose, "{}/".format(str(res.status_code)))
        store_request_text(dir, 'reviews000-099.text', res)
        google_dos_protection()
        res = requests.post(
            ExtensionCrawler.config.const_review_url(),
            data=ExtensionCrawler.config.const_review_payload(ext_id, "0",
                                                              "100"))
        log(verbose, "{}".format(str(res.status_code)))
        store_request_text(dir, 'reviews100-199.text', res)
    except Exception as e:
        log(verbose, " / Exception: {}\n".format(str(e)))
        write_text(dir, 'reviews.html.exception', str(e))
        return RequestResult(res, e)
    log(verbose, "\n")
    return RequestResult(res)

def update_support(dir, verbose, ext_id):
    log(verbose, " * support page: ")
    res = None
    try:
        google_dos_protection()
        res = requests.post(
            ExtensionCrawler.config.const_support_url(),
            data=ExtensionCrawler.config.const_support_payload(ext_id, "0",
                                                               "100"))
        log(verbose, "{}/".format(str(res.status_code)))
        store_request_text(dir, 'support000-099.text', res)
        google_dos_protection()
        res = requests.post(
            ExtensionCrawler.config.const_support_url(),
            data=ExtensionCrawler.config.const_support_payload(ext_id, "100",
                                                               "100"))
        log(verbose, "{}".format(str(res.status_code)))
        store_request_text(dir, 'support100-199.text', res)
    except Exception as e:
        log(verbose, " / Exception: {}\n".format(str(e)))
        write_text(dir, 'support.html.exception', str(e))
        return RequestResult(res, e)
    log(verbose, "\n")
    return RequestResult(res)

def update_extension(archivedir, verbose, forums, ext_id):
    log(verbose, " Updating {}".format(ext_id))
    if forums:
        log(verbose, " (including forums)")
    log(verbose, "\n")
    date = datetime.now(timezone.utc).isoformat()
    dir = os.path.join(
        os.path.join(archivedir,
                     ExtensionCrawler.archive.get_local_archive_dir(ext_id)),
        date)
    os.makedirs(dir, exist_ok=True)
    res_overview = update_overview(dir, verbose, ext_id)
    res_crx = update_crx(dir, verbose, ext_id)
    res_reviews = None
    res_support = None
    if forums:
        res_reviews = update_reviews(dir, verbose, ext_id)
        res_support = update_support(dir, verbose, ext_id)
    return UpdateResult(ext_id, res_overview, res_crx, res_reviews,
                        res_support)

def update_extensions(archivedir, verbose, forums_ext_ids, known_ext_ids,
                      new_ext_ids):
    def update_forums(ext_id):
        return (ext_id in forums_ext_ids)
    ext_ids = known_ext_ids + new_ext_ids
    log(verbose,
        "Updating {} extensions ({} new, {} including forums)\n".format(
            len(ext_ids), len(new_ext_ids), len(forums_ext_ids)))
    return list(map(lambda ext_id: update_extension(archivedir, verbose,
                                                    update_forums(ext_id), ext_id),
                    ext_ids))

def get_existing_ids(archivedir, verbose):
    byte = '[0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z]'
    word = byte + byte + byte + byte
    return list(
        map(lambda d: re.sub("^.*\/", "", d),
            glob.glob(os.path.join(archivedir, "*", word))))

def get_forum_ext_ids(confdir, verbose):
    with open(os.path.join(confdir, "forums.conf")) as f:
        ids = f.readlines()
    ids = [x.strip() for x in ids]
    return ids

def get_new_ids(verbose, known_ids):
    log(verbose, "Discovering new ids ... \n")
    discovered_ids = ExtensionCrawler.discover.crawl_nearly_all_of_ext_ids()
    new_ids = list(set(discovered_ids) - set(known_ids))
    log(verbose, " Discovered {} new extensions (out of {})\n".format(
        len(new_ids), len(discovered_ids)))
    return new_ids

def log_summary(verbose, res):
    total = len(res)
    success = len(list(filter(lambda x: x.is_ok(), res)))
@@ -225,7 +46,7 @@ def log_summary(verbose, res):
        str(success), str(total)))
    log(verbose, " Not authorized: {}\n".format(str(not_authorized)))
    log(verbose, " Raised Google DDOS: {}\n".format(str(raised_ddos)))
-   log(verbose, " Updated archives: {}\n".format(str(success-not_modified)))
+   log(verbose, " Not modified archives: {}\n".format(str(not_modified)))
    log(verbose, " Extensions not in store: {}\n".format(str(not_in_store)))
    log(verbose, " Unknown exception: {}\n".format(str(has_exception)))