Reformatting.

This commit is contained in:
Achim D. Brucker 2017-01-28 13:15:05 +00:00
parent 3ed43f036d
commit 3cdeba20b4
4 changed files with 31 additions and 38 deletions

57
ExtensionCrawler/archive.py Executable file → Normal file
View File

@ -49,22 +49,22 @@ class RequestResult:
self.exception = exception
def is_ok(self):
    """Return True when no exception was recorded and the HTTP status is 200."""
    return (self.exception is None) and (self.http_status == 200)
def not_authorized(self):
    """Return True when no exception was recorded and the HTTP status is 401."""
    return (self.exception is None) and (self.http_status == 401)
def not_found(self):
    """Return True when no exception was recorded and the HTTP status is 404."""
    return (self.exception is None) and (self.http_status == 404)
def has_exception(self):
    """Return True when an exception object is stored on this result."""
    return self.exception is not None
def not_available(self):
    """Return True when no exception was recorded and the HTTP status is 503."""
    return (self.exception is None) and (self.http_status == 503)
def not_modified(self):
    """Return True when no exception was recorded and the HTTP status is 304."""
    return (self.exception is None) and (self.http_status == 304)
class UpdateResult:
@ -76,9 +76,10 @@ class UpdateResult:
self.res_support = res_support
def is_ok(self):
    """Return whether every part of the update succeeded.

    The update counts as successful when:
      * the overview result is ok,
      * the crx result is ok or reports "not modified",
      * the reviews result is either absent (None) or ok, and
      * the support result is either absent (None) or ok.

    Evaluation short-circuits left to right, so later results are not
    queried once an earlier condition fails.
    """
    return (self.res_overview.is_ok() and
            (self.res_crx.is_ok() or self.res_crx.not_modified()) and
            ((self.res_reviews is None) or self.res_reviews.is_ok()) and
            ((self.res_support is None) or self.res_support.is_ok()))
def not_authorized(self):
return (self.res_overview.not_authorized() or
@ -112,13 +113,14 @@ class UpdateResult:
return self.res_crx.not_modified()
def get_local_archive_dir(id):
    """Return the relative archive directory for an extension id.

    The result is "<first three characters of id>/<id>", always joined
    with a forward slash.
    """
    return "{}/{}".format(id[:3], id)
def get_local_archive_dirs(id):
    """Return a one-element list holding get_local_archive_dir(id)."""
    return [get_local_archive_dir(id)]
def write_text(dir, fname, text):
    """Write `text` to the file `fname` inside directory `dir`.

    The file is opened in text write mode, so an existing file of the same
    name is overwritten. The directory must already exist.
    """
    with open(os.path.join(dir, fname), 'w') as f:
        f.write(text)
@ -134,6 +136,7 @@ def store_request_text(dir, fname, request):
write_text(dir, fname, request.text)
store_request_metadata(dir, fname, request)
def httpdate(dt):
weekday = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()]
month = [
@ -154,6 +157,8 @@ def last_modified_http_date(path):
if path is "":
return ""
return httpdate(dateutil.parser.parse(last_modified_utc_date(path)))
def last_crx(dir, extid):
old_archives = sorted(
glob.glob(os.path.join(os.path.dirname(dir), "*/*.crx")))
@ -163,10 +168,6 @@ def last_crx(dir, extid):
return last_archive
def update_overview(dir, verbose, ext_id):
log(verbose, " * overview page: ")
try:
@ -207,10 +208,9 @@ def update_crx(dir, verbose, ext_id):
if last_crx_file is not "":
headers = {'If-Modified-Since': last_crx_http_date}
try:
res = requests.get(
const_download_url().format(ext_id),
stream=True,
headers=headers)
res = requests.get(const_download_url().format(ext_id),
stream=True,
headers=headers)
log(verbose, "{}".format(str(res.status_code)))
extfilename = os.path.basename(res.url)
store_request_metadata(dir, extfilename, res)
@ -240,16 +240,12 @@ def update_reviews(dir, verbose, ext_id):
try:
google_dos_protection()
res = requests.post(
const_review_url(),
data=const_review_payload(ext_id, "0",
"100"))
const_review_url(), data=const_review_payload(ext_id, "0", "100"))
log(verbose, "{}/".format(str(res.status_code)))
store_request_text(dir, 'reviews000-099.text', res)
google_dos_protection()
res = requests.post(
const_review_url(),
data=const_review_payload(ext_id, "0",
"100"))
const_review_url(), data=const_review_payload(ext_id, "0", "100"))
log(verbose, "{}".format(str(res.status_code)))
store_request_text(dir, 'reviews100-199.text', res)
except Exception as e:
@ -267,15 +263,13 @@ def update_support(dir, verbose, ext_id):
google_dos_protection()
res = requests.post(
const_support_url(),
data=const_support_payload(ext_id, "0",
"100"))
data=const_support_payload(ext_id, "0", "100"))
log(verbose, "{}/".format(str(res.status_code)))
store_request_text(dir, 'support000-099.text', res)
google_dos_protection()
res = requests.post(
const_support_url(),
data=const_support_payload(ext_id, "100",
"100"))
data=const_support_payload(ext_id, "100", "100"))
log(verbose, "{}".format(str(res.status_code)))
store_request_text(dir, 'support100-199.text', res)
except Exception as e:
@ -293,9 +287,7 @@ def update_extension(archivedir, verbose, forums, ext_id):
log(verbose, "\n")
date = datetime.now(timezone.utc).isoformat()
dir = os.path.join(
os.path.join(archivedir,
get_local_archive_dir(ext_id)),
date)
os.path.join(archivedir, get_local_archive_dir(ext_id)), date)
os.makedirs(dir, exist_ok=True)
res_overview = update_overview(dir, verbose, ext_id)
res_crx = update_crx(dir, verbose, ext_id)
@ -335,6 +327,3 @@ def get_forum_ext_ids(confdir, verbose):
ids = f.readlines()
ids = [x.strip() for x in ids]
return ids

View File

@ -48,6 +48,7 @@ def crawl_nearly_all_of_ext_ids():
map(lambda s: [elem.text for elem in get_inner_elems(s)], shards), [])
return [re.search("[a-z]{32}", url).group(0) for url in overview_urls]
def get_new_ids(verbose, known_ids):
log(verbose, "Discovering new ids ... \n")
discovered_ids = ExtensionCrawler.discover.crawl_nearly_all_of_ext_ids()
@ -55,4 +56,3 @@ def get_new_ids(verbose, known_ids):
log(verbose, " Discovered {} new extensions (out of {})\n".format(
len(new_ids), len(discovered_ids)))
return new_ids

View File

@ -16,19 +16,21 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import sys
from time import sleep
from random import randint
from datetime import datetime, timezone
def google_dos_protection(max=3):
    """Sleep for a random multiple of half a second.

    The delay is randint(1, max) * 0.5 seconds, i.e. between 0.5 s and
    max / 2 seconds in half-second steps. `max` must be at least 1,
    otherwise randint raises ValueError.
    """
    sleep(randint(1, max) * .5)
def log(verbose, msg):
    """Write `msg` to standard output when `verbose` is truthy.

    No newline is appended; callers include it in `msg` when wanted.
    """
    if verbose:
        sys.stdout.write(msg)
def valueOf(value, default):
if value is not None and value is not "":
return value

View File

@ -44,11 +44,13 @@ def log_summary(verbose, res):
log(verbose, "Summary:\n")
log(verbose, " Updated {} out of {} extensions successfully\n".format(
str(success), str(total)))
log(verbose, " Not authorized: {}\n".format(str(not_authorized)))
log(verbose,
" Not authorized: {}\n".format(str(not_authorized)))
log(verbose, " Raised Google DDOS: {}\n".format(str(raised_ddos)))
log(verbose, " Not modified archives: {}\n".format(str(not_modified)))
log(verbose, " Extensions not in store: {}\n".format(str(not_in_store)))
log(verbose, " Unknown exception: {}\n".format(str(has_exception)))
log(verbose,
" Unknown exception: {}\n".format(str(has_exception)))
def main():