Refactoring.
parent 23e1147370
commit 3ed43f036d
@@ -24,7 +24,9 @@ import requests
 from time import sleep
 from random import randint
 from datetime import datetime, timezone
+from ExtensionCrawler.config import *
 from ExtensionCrawler.util import *
+from ExtensionCrawler.archive import *
 import dateutil
 import dateutil.parser
 
@@ -161,3 +163,178 @@ def last_crx(dir, extid):
     return last_archive
+
+
+def update_overview(dir, verbose, ext_id):
+    log(verbose, " * overview page: ")
+    try:
+        res = requests.get(const_overview_url(ext_id))
+        log(verbose, "{}".format(str(res.status_code)))
+        store_request_text(dir, 'overview.html', res)
+    except Exception as e:
+        log(verbose, " / Exception: {}\n".format(str(e)))
+        write_text(dir, 'overview.html.exception', str(e))
+        return RequestResult(res, e)
+    log(verbose, "\n")
+    return RequestResult(res)
+
+
+def validate_crx_response(res, extfilename):
+    regex_extfilename = re.compile(r'^extension[_0-9]+\.crx$')
+    if not 'Content-Type' in res.headers:
+        raise CrawlError(extid, 'Did not find Content-Type header.',
+                         '\n'.join(res.iter_lines()))
+    if not res.headers['Content-Type'] == 'application/x-chrome-extension':
+        text = [line.decode('utf-8') for line in res.iter_lines()]
+        raise CrawlError(
+            extid,
+            'Expected Content-Type header to be application/x-chrome-extension, but got {}.'.
+            format(res.headers['Content-Type']), '\n'.join(text))
+    if not regex_extfilename.match(extfilename):
+        raise CrawlError(
+            extid, '{} is not a valid extension file name, skipping...'.format(
+                extfilename))
+
+
+def update_crx(dir, verbose, ext_id):
+    last_crx_file = last_crx(dir, ext_id)
+    last_crx_http_date = last_modified_http_date(last_crx_file)
+    log(verbose, " * crx archive (Last: {}): ".format(
+        valueOf(last_crx_http_date, "n/a")))
+    headers = ""
+    if last_crx_file is not "":
+        headers = {'If-Modified-Since': last_crx_http_date}
+    try:
+        res = requests.get(
+            const_download_url().format(ext_id),
+            stream=True,
+            headers=headers)
+        log(verbose, "{}".format(str(res.status_code)))
+        extfilename = os.path.basename(res.url)
+        store_request_metadata(dir, extfilename, res)
+
+        if res.status_code == 304:
+            write_text(dir, extfilename + ".link",
+                       os.path.join("..",
+                                    last_modified_utc_date(last_crx_file),
+                                    extfilename) + "\n")
+        else:
+            validate_crx_response(res, extfilename)
+            with open(os.path.join(dir, extfilename), 'wb') as f:
+                for chunk in res.iter_content(chunk_size=512 * 1024):
+                    if chunk:  # filter out keep-alive new chunks
+                        f.write(chunk)
+    except Exception as e:
+        log(verbose, " / Exception: {}\n".format(str(e)))
+        write_text(dir, extfilename + ".exception", str(e))
+        return RequestResult(res, e)
+    log(verbose, "\n")
+    return RequestResult(res)
+
+
+def update_reviews(dir, verbose, ext_id):
+    log(verbose, " * review page: ")
+    res = None
+    try:
+        google_dos_protection()
+        res = requests.post(
+            const_review_url(),
+            data=const_review_payload(ext_id, "0",
+                                      "100"))
+        log(verbose, "{}/".format(str(res.status_code)))
+        store_request_text(dir, 'reviews000-099.text', res)
+        google_dos_protection()
+        res = requests.post(
+            const_review_url(),
+            data=const_review_payload(ext_id, "0",
+                                      "100"))
+        log(verbose, "{}".format(str(res.status_code)))
+        store_request_text(dir, 'reviews100-199.text', res)
+    except Exception as e:
+        log(verbose, " / Exception: {}\n".format(str(e)))
+        write_text(dir, 'reviews.html.exception', str(e))
+        return RequestResult(res, e)
+    log(verbose, "\n")
+    return RequestResult(res)
+
+
+def update_support(dir, verbose, ext_id):
+    log(verbose, " * support page: ")
+    res = None
+    try:
+        google_dos_protection()
+        res = requests.post(
+            const_support_url(),
+            data=const_support_payload(ext_id, "0",
+                                       "100"))
+        log(verbose, "{}/".format(str(res.status_code)))
+        store_request_text(dir, 'support000-099.text', res)
+        google_dos_protection()
+        res = requests.post(
+            const_support_url(),
+            data=const_support_payload(ext_id, "100",
+                                       "100"))
+        log(verbose, "{}".format(str(res.status_code)))
+        store_request_text(dir, 'support100-199.text', res)
+    except Exception as e:
+        log(verbose, " / Exception: {}\n".format(str(e)))
+        write_text(dir, 'support.html.exception', str(e))
+        return RequestResult(res, e)
+    log(verbose, "\n")
+    return RequestResult(res)
+
+
+def update_extension(archivedir, verbose, forums, ext_id):
+    log(verbose, " Updating {}".format(ext_id))
+    if forums:
+        log(verbose, " (including forums)")
+    log(verbose, "\n")
+    date = datetime.now(timezone.utc).isoformat()
+    dir = os.path.join(
+        os.path.join(archivedir,
+                     get_local_archive_dir(ext_id)),
+        date)
+    os.makedirs(dir, exist_ok=True)
+    res_overview = update_overview(dir, verbose, ext_id)
+    res_crx = update_crx(dir, verbose, ext_id)
+    res_reviews = None
+    res_support = None
+    if forums:
+        res_reviews = update_reviews(dir, verbose, ext_id)
+        res_support = update_support(dir, verbose, ext_id)
+    return UpdateResult(ext_id, res_overview, res_crx, res_reviews,
+                        res_support)
+
+
+def update_extensions(archivedir, verbose, forums_ext_ids, known_ext_ids,
+                      new_ext_ids):
+    def update_forums(ext_id):
+        return (ext_id in forums_ext_ids)
+
+    ext_ids = known_ext_ids + new_ext_ids
+    log(verbose,
+        "Updating {} extensions ({} new, {} including forums)\n".format(
+            len(ext_ids), len(new_ext_ids), len(forums_ext_ids)))
+    return list(map(lambda ext_id: update_extension(archivedir, verbose,
+                                                    update_forums(ext_id), ext_id),
+                    ext_ids))
+
+
+def get_existing_ids(archivedir, verbose):
+    byte = '[0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z]'
+    word = byte + byte + byte + byte
+    return list(
+        map(lambda d: re.sub("^.*\/", "", d),
+            glob.glob(os.path.join(archivedir, "*", word))))
+
+
+def get_forum_ext_ids(confdir, verbose):
+    with open(os.path.join(confdir, "forums.conf")) as f:
+        ids = f.readlines()
+    ids = [x.strip() for x in ids]
+    return ids
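The crx download in the update_crx added above relies on HTTP conditional requests: the Last-Modified date of the most recently archived crx is sent as If-Modified-Since, and a 304 Not Modified answer is recorded as a small ".link" file pointing at the previous archive copy instead of downloading the payload again. A minimal, self-contained sketch of that pattern (URL, directory layout, and helper names here are placeholders, not the crawler's actual configuration):

import os
import requests


def fetch_if_modified(url, target_dir, last_http_date, prev_archive_dir):
    # Sketch of a conditional download: send If-Modified-Since and either
    # store the new file or record a pointer to the previously archived copy.
    headers = {'If-Modified-Since': last_http_date} if last_http_date else {}
    res = requests.get(url, stream=True, headers=headers)
    filename = os.path.basename(res.url)
    if res.status_code == 304:
        # Upstream unchanged: write a relative link to the old archive copy.
        with open(os.path.join(target_dir, filename + ".link"), "w") as f:
            f.write(os.path.join("..", prev_archive_dir, filename) + "\n")
    else:
        res.raise_for_status()
        with open(os.path.join(target_dir, filename), "wb") as f:
            for chunk in res.iter_content(chunk_size=512 * 1024):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)
    return res.status_code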
@@ -22,6 +22,8 @@ import requests
 import re
 from functools import reduce
 import ExtensionCrawler.config
+from ExtensionCrawler.util import *
+
 
 def crawl_nearly_all_of_ext_ids():
     def get_inner_elems(doc):
@@ -45,3 +47,12 @@ def crawl_nearly_all_of_ext_ids():
         lambda x, y: x + y,
         map(lambda s: [elem.text for elem in get_inner_elems(s)], shards), [])
     return [re.search("[a-z]{32}", url).group(0) for url in overview_urls]
+
+
+def get_new_ids(verbose, known_ids):
+    log(verbose, "Discovering new ids ... \n")
+    discovered_ids = ExtensionCrawler.discover.crawl_nearly_all_of_ext_ids()
+    new_ids = list(set(discovered_ids) - set(known_ids))
+    log(verbose, " Discovered {} new extensions (out of {})\n".format(
+        len(new_ids), len(discovered_ids)))
+    return new_ids
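The get_new_ids added above treats an extension id as the 32-character lowercase token inside an overview URL and reduces discovery to a set difference against the ids already archived. A small sketch of that extraction, with made-up URLs for illustration only:

import re


def extract_new_ids(overview_urls, known_ids):
    # Pull the 32-character id out of each overview URL and keep only the
    # ids not seen before (the same set difference used by get_new_ids).
    discovered = [re.search("[a-z]{32}", url).group(0) for url in overview_urls]
    return sorted(set(discovered) - set(known_ids))


# Hypothetical input, for illustration only:
urls = ["https://chrome.google.com/webstore/detail/" + "a" * 32,
        "https://chrome.google.com/webstore/detail/" + "b" * 32]
print(extract_new_ids(urls, known_ids=["a" * 32]))  # -> the single new "b..." id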
crawler
@@ -27,190 +27,11 @@ from datetime import datetime, timezone
 from ExtensionCrawler.discover import *
 from ExtensionCrawler.archive import *
 from ExtensionCrawler.util import *
-from ExtensionCrawler.discover import *
 import dateutil
 import dateutil.parser
 
 
-def update_overview(dir, verbose, ext_id):
-    log(verbose, " * overview page: ")
-    try:
-        res = requests.get(ExtensionCrawler.config.const_overview_url(ext_id))
-        log(verbose, "{}".format(str(res.status_code)))
-        store_request_text(dir, 'overview.html', res)
-    except Exception as e:
-        log(verbose, " / Exception: {}\n".format(str(e)))
-        write_text(dir, 'overview.html.exception', str(e))
-        return RequestResult(res, e)
-    log(verbose, "\n")
-    return RequestResult(res)
-
-
-def validate_crx_response(res, extfilename):
-    regex_extfilename = re.compile(r'^extension[_0-9]+\.crx$')
-    if not 'Content-Type' in res.headers:
-        raise CrawlError(extid, 'Did not find Content-Type header.',
-                         '\n'.join(res.iter_lines()))
-    if not res.headers['Content-Type'] == 'application/x-chrome-extension':
-        text = [line.decode('utf-8') for line in res.iter_lines()]
-        raise CrawlError(
-            extid,
-            'Expected Content-Type header to be application/x-chrome-extension, but got {}.'.
-            format(res.headers['Content-Type']), '\n'.join(text))
-    if not regex_extfilename.match(extfilename):
-        raise CrawlError(
-            extid, '{} is not a valid extension file name, skipping...'.format(
-                extfilename))
-
-
-def update_crx(dir, verbose, ext_id):
-    last_crx_file = last_crx(dir, ext_id)
-    last_crx_http_date = last_modified_http_date(last_crx_file)
-    log(verbose, " * crx archive (Last: {}): ".format(
-        valueOf(last_crx_http_date, "n/a")))
-    headers = ""
-    if last_crx_file is not "":
-        headers = {'If-Modified-Since': last_crx_http_date}
-    try:
-        res = requests.get(
-            ExtensionCrawler.config.const_download_url().format(ext_id),
-            stream=True,
-            headers=headers)
-        log(verbose, "{}".format(str(res.status_code)))
-        extfilename = os.path.basename(res.url)
-        store_request_metadata(dir, extfilename, res)
-
-        if res.status_code == 304:
-            write_text(dir, extfilename + ".link",
-                       os.path.join("..",
-                                    last_modified_utc_date(last_crx_file),
-                                    extfilename) + "\n")
-        else:
-            validate_crx_response(res, extfilename)
-            with open(os.path.join(dir, extfilename), 'wb') as f:
-                for chunk in res.iter_content(chunk_size=512 * 1024):
-                    if chunk:  # filter out keep-alive new chunks
-                        f.write(chunk)
-    except Exception as e:
-        log(verbose, " / Exception: {}\n".format(str(e)))
-        write_text(dir, extfilename + ".exception", str(e))
-        return RequestResult(res, e)
-    log(verbose, "\n")
-    return RequestResult(res)
-
-
-def update_reviews(dir, verbose, ext_id):
-    log(verbose, " * review page: ")
-    res = None
-    try:
-        google_dos_protection()
-        res = requests.post(
-            ExtensionCrawler.config.const_review_url(),
-            data=ExtensionCrawler.config.const_review_payload(ext_id, "0",
-                                                              "100"))
-        log(verbose, "{}/".format(str(res.status_code)))
-        store_request_text(dir, 'reviews000-099.text', res)
-        google_dos_protection()
-        res = requests.post(
-            ExtensionCrawler.config.const_review_url(),
-            data=ExtensionCrawler.config.const_review_payload(ext_id, "0",
-                                                              "100"))
-        log(verbose, "{}".format(str(res.status_code)))
-        store_request_text(dir, 'reviews100-199.text', res)
-    except Exception as e:
-        log(verbose, " / Exception: {}\n".format(str(e)))
-        write_text(dir, 'reviews.html.exception', str(e))
-        return RequestResult(res, e)
-    log(verbose, "\n")
-    return RequestResult(res)
-
-
-def update_support(dir, verbose, ext_id):
-    log(verbose, " * support page: ")
-    res = None
-    try:
-        google_dos_protection()
-        res = requests.post(
-            ExtensionCrawler.config.const_support_url(),
-            data=ExtensionCrawler.config.const_support_payload(ext_id, "0",
-                                                               "100"))
-        log(verbose, "{}/".format(str(res.status_code)))
-        store_request_text(dir, 'support000-099.text', res)
-        google_dos_protection()
-        res = requests.post(
-            ExtensionCrawler.config.const_support_url(),
-            data=ExtensionCrawler.config.const_support_payload(ext_id, "100",
-                                                               "100"))
-        log(verbose, "{}".format(str(res.status_code)))
-        store_request_text(dir, 'support100-199.text', res)
-    except Exception as e:
-        log(verbose, " / Exception: {}\n".format(str(e)))
-        write_text(dir, 'support.html.exception', str(e))
-        return RequestResult(res, e)
-    log(verbose, "\n")
-    return RequestResult(res)
-
-
-def update_extension(archivedir, verbose, forums, ext_id):
-    log(verbose, " Updating {}".format(ext_id))
-    if forums:
-        log(verbose, " (including forums)")
-    log(verbose, "\n")
-    date = datetime.now(timezone.utc).isoformat()
-    dir = os.path.join(
-        os.path.join(archivedir,
-                     ExtensionCrawler.archive.get_local_archive_dir(ext_id)),
-        date)
-    os.makedirs(dir, exist_ok=True)
-    res_overview = update_overview(dir, verbose, ext_id)
-    res_crx = update_crx(dir, verbose, ext_id)
-    res_reviews = None
-    res_support = None
-    if forums:
-        res_reviews = update_reviews(dir, verbose, ext_id)
-        res_support = update_support(dir, verbose, ext_id)
-    return UpdateResult(ext_id, res_overview, res_crx, res_reviews,
-                        res_support)
-
-
-def update_extensions(archivedir, verbose, forums_ext_ids, known_ext_ids,
-                      new_ext_ids):
-    def update_forums(ext_id):
-        return (ext_id in forums_ext_ids)
-
-    ext_ids = known_ext_ids + new_ext_ids
-    log(verbose,
-        "Updating {} extensions ({} new, {} including forums)\n".format(
-            len(ext_ids), len(new_ext_ids), len(forums_ext_ids)))
-    return list(map(lambda ext_id: update_extension(archivedir, verbose,
-                                                    update_forums(ext_id), ext_id),
-                    ext_ids))
-
-
-def get_existing_ids(archivedir, verbose):
-    byte = '[0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z][0-9a-z]'
-    word = byte + byte + byte + byte
-    return list(
-        map(lambda d: re.sub("^.*\/", "", d),
-            glob.glob(os.path.join(archivedir, "*", word))))
-
-
-def get_forum_ext_ids(confdir, verbose):
-    with open(os.path.join(confdir, "forums.conf")) as f:
-        ids = f.readlines()
-    ids = [x.strip() for x in ids]
-    return ids
-
-
-def get_new_ids(verbose, known_ids):
-    log(verbose, "Discovering new ids ... \n")
-    discovered_ids = ExtensionCrawler.discover.crawl_nearly_all_of_ext_ids()
-    new_ids = list(set(discovered_ids) - set(known_ids))
-    log(verbose, " Discovered {} new extensions (out of {})\n".format(
-        len(new_ids), len(discovered_ids)))
-    return new_ids
-
-
 def log_summary(verbose, res):
     total = len(res)
     success = len(list(filter(lambda x: x.is_ok(), res)))
@@ -225,7 +46,7 @@ def log_summary(verbose, res):
         str(success), str(total)))
     log(verbose, " Not authorized: {}\n".format(str(not_authorized)))
    log(verbose, " Raised Google DDOS: {}\n".format(str(raised_ddos)))
-    log(verbose, " Updated archives: {}\n".format(str(success-not_modified)))
+    log(verbose, " Not modified archives: {}\n".format(str(not_modified)))
     log(verbose, " Extensions not in store: {}\n".format(str(not_in_store)))
     log(verbose, " Unknown exception: {}\n".format(str(has_exception)))
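After this refactoring the crawler script keeps only the driver and reporting code; log_summary classifies the returned result objects by filtering on predicates such as is_ok(). A tiny sketch of that filter-and-count style, using a stand-in result class rather than the crawler's actual UpdateResult:

class FakeResult:
    # Hypothetical stand-in for the crawler's result objects.
    def __init__(self, ok):
        self._ok = ok

    def is_ok(self):
        return self._ok


def summarize(results):
    # Same counting style as log_summary: filter on a predicate, then len().
    total = len(results)
    success = len(list(filter(lambda x: x.is_ok(), results)))
    return "{} of {} extensions updated without error".format(success, total)


print(summarize([FakeResult(True), FakeResult(False), FakeResult(True)]))
# -> "2 of 3 extensions updated without error"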