Reformatting.
This commit is contained in:
parent
3ed43f036d
commit
3cdeba20b4
|
@ -76,8 +76,9 @@ class UpdateResult:
|
|||
self.res_support = res_support
|
||||
|
||||
def is_ok(self):
    """Tell whether all parts of this update result count as successful.

    The checks run in order and stop at the first one that fails:
    the overview result must be ok; the crx result must be ok or report
    "not modified"; the reviews and support results must be ok unless they
    are None, in which case they are not checked.
    """
    outcome = self.res_overview.is_ok()
    if outcome:
        # An unchanged archive is as good as a freshly downloaded one.
        outcome = self.res_crx.is_ok() or self.res_crx.not_modified()
    if outcome:
        outcome = (self.res_reviews is None) or self.res_reviews.is_ok()
    if outcome:
        outcome = (self.res_support is None) or self.res_support.is_ok()
    return outcome
|
||||
|
||||
def not_authorized(self):
|
||||
|
@ -112,13 +113,14 @@ class UpdateResult:
|
|||
return self.res_crx.not_modified()
|
||||
|
||||
|
||||
|
||||
def get_local_archive_dir(id):
    """Build the relative archive path "<first three characters of id>/<id>"."""
    prefix = id[:3]
    return "{}/{}".format(prefix, id)
|
||||
|
||||
|
||||
def get_local_archive_dirs(id):
    """Return a one-element list holding the result of get_local_archive_dir(id)."""
    archive_dir = get_local_archive_dir(id)
    return [archive_dir]
|
||||
|
||||
|
||||
def write_text(dir, fname, text):
    """Write ``text`` to the file ``fname`` inside directory ``dir``.

    An existing file of that name is overwritten. The file is always encoded
    as UTF-8, so the result does not depend on the locale of the machine and
    text containing non-ASCII characters cannot fail with UnicodeEncodeError.

    Args:
        dir: Path of the directory to write into.
        fname: Name of the file to create within ``dir``.
        text: String to write.
    """
    with open(os.path.join(dir, fname), 'w', encoding='utf-8') as f:
        f.write(text)
|
||||
|
@ -134,6 +136,7 @@ def store_request_text(dir, fname, request):
|
|||
write_text(dir, fname, request.text)
|
||||
store_request_metadata(dir, fname, request)
|
||||
|
||||
|
||||
def httpdate(dt):
|
||||
weekday = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()]
|
||||
month = [
|
||||
|
@ -154,6 +157,8 @@ def last_modified_http_date(path):
|
|||
if path is "":
|
||||
return ""
|
||||
return httpdate(dateutil.parser.parse(last_modified_utc_date(path)))
|
||||
|
||||
|
||||
def last_crx(dir, extid):
|
||||
old_archives = sorted(
|
||||
glob.glob(os.path.join(os.path.dirname(dir), "*/*.crx")))
|
||||
|
@ -163,10 +168,6 @@ def last_crx(dir, extid):
|
|||
return last_archive
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def update_overview(dir, verbose, ext_id):
|
||||
log(verbose, " * overview page: ")
|
||||
try:
|
||||
|
@ -207,8 +208,7 @@ def update_crx(dir, verbose, ext_id):
|
|||
if last_crx_file is not "":
|
||||
headers = {'If-Modified-Since': last_crx_http_date}
|
||||
try:
|
||||
res = requests.get(
|
||||
const_download_url().format(ext_id),
|
||||
res = requests.get(const_download_url().format(ext_id),
|
||||
stream=True,
|
||||
headers=headers)
|
||||
log(verbose, "{}".format(str(res.status_code)))
|
||||
|
@ -240,16 +240,12 @@ def update_reviews(dir, verbose, ext_id):
|
|||
try:
|
||||
google_dos_protection()
|
||||
res = requests.post(
|
||||
const_review_url(),
|
||||
data=const_review_payload(ext_id, "0",
|
||||
"100"))
|
||||
const_review_url(), data=const_review_payload(ext_id, "0", "100"))
|
||||
log(verbose, "{}/".format(str(res.status_code)))
|
||||
store_request_text(dir, 'reviews000-099.text', res)
|
||||
google_dos_protection()
|
||||
res = requests.post(
|
||||
const_review_url(),
|
||||
data=const_review_payload(ext_id, "0",
|
||||
"100"))
|
||||
const_review_url(), data=const_review_payload(ext_id, "0", "100"))
|
||||
log(verbose, "{}".format(str(res.status_code)))
|
||||
store_request_text(dir, 'reviews100-199.text', res)
|
||||
except Exception as e:
|
||||
|
@ -267,15 +263,13 @@ def update_support(dir, verbose, ext_id):
|
|||
google_dos_protection()
|
||||
res = requests.post(
|
||||
const_support_url(),
|
||||
data=const_support_payload(ext_id, "0",
|
||||
"100"))
|
||||
data=const_support_payload(ext_id, "0", "100"))
|
||||
log(verbose, "{}/".format(str(res.status_code)))
|
||||
store_request_text(dir, 'support000-099.text', res)
|
||||
google_dos_protection()
|
||||
res = requests.post(
|
||||
const_support_url(),
|
||||
data=const_support_payload(ext_id, "100",
|
||||
"100"))
|
||||
data=const_support_payload(ext_id, "100", "100"))
|
||||
log(verbose, "{}".format(str(res.status_code)))
|
||||
store_request_text(dir, 'support100-199.text', res)
|
||||
except Exception as e:
|
||||
|
@ -293,9 +287,7 @@ def update_extension(archivedir, verbose, forums, ext_id):
|
|||
log(verbose, "\n")
|
||||
date = datetime.now(timezone.utc).isoformat()
|
||||
dir = os.path.join(
|
||||
os.path.join(archivedir,
|
||||
get_local_archive_dir(ext_id)),
|
||||
date)
|
||||
os.path.join(archivedir, get_local_archive_dir(ext_id)), date)
|
||||
os.makedirs(dir, exist_ok=True)
|
||||
res_overview = update_overview(dir, verbose, ext_id)
|
||||
res_crx = update_crx(dir, verbose, ext_id)
|
||||
|
@ -335,6 +327,3 @@ def get_forum_ext_ids(confdir, verbose):
|
|||
ids = f.readlines()
|
||||
ids = [x.strip() for x in ids]
|
||||
return ids
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -48,6 +48,7 @@ def crawl_nearly_all_of_ext_ids():
|
|||
map(lambda s: [elem.text for elem in get_inner_elems(s)], shards), [])
|
||||
return [re.search("[a-z]{32}", url).group(0) for url in overview_urls]
|
||||
|
||||
|
||||
def get_new_ids(verbose, known_ids):
|
||||
log(verbose, "Discovering new ids ... \n")
|
||||
discovered_ids = ExtensionCrawler.discover.crawl_nearly_all_of_ext_ids()
|
||||
|
@ -55,4 +56,3 @@ def get_new_ids(verbose, known_ids):
|
|||
log(verbose, " Discovered {} new extensions (out of {})\n".format(
|
||||
len(new_ids), len(discovered_ids)))
|
||||
return new_ids
|
||||
|
||||
|
|
|
@ -16,19 +16,21 @@
|
|||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#
|
||||
|
||||
|
||||
import sys
|
||||
from time import sleep
|
||||
from random import randint
|
||||
from datetime import datetime, timezone
|
||||
|
||||
|
||||
def google_dos_protection(max=3):
    """Block the caller for a random multiple of half a second.

    A whole number between 1 and ``max`` (inclusive) is drawn and the call
    sleeps for that many half-seconds, i.e. 0.5 s up to ``max * 0.5`` s.
    Presumably used to space out successive requests; confirm with callers.

    Raises:
        ValueError: if ``max`` is smaller than 1 (raised by ``randint``).
    """
    half_seconds = randint(1, max)
    sleep(half_seconds * .5)
|
||||
|
||||
|
||||
def log(verbose, msg):
    """Write ``msg`` to standard output, but only when ``verbose`` is truthy.

    The message is written as given; no newline is appended.
    """
    if not verbose:
        return
    sys.stdout.write(msg)
|
||||
|
||||
|
||||
def valueOf(value, default):
    """Return ``value`` when it is neither None nor the empty string.

    The empty string is detected by equality, not identity: ``is not ""``
    only works where the interpreter happens to share one empty-string
    object, and it triggers a SyntaxWarning on Python 3.8+.

    NOTE(review): the branch taken for None / "" is not visible in this
    excerpt; presumably it returns ``default`` -- confirm against the file.
    """
    if value is not None and value != "":
        return value
|
||||
|
|
6
crawler
6
crawler
|
@ -44,11 +44,13 @@ def log_summary(verbose, res):
|
|||
log(verbose, "Summary:\n")
|
||||
log(verbose, " Updated {} out of {} extensions successfully\n".format(
|
||||
str(success), str(total)))
|
||||
log(verbose, " Not authorized: {}\n".format(str(not_authorized)))
|
||||
log(verbose,
|
||||
" Not authorized: {}\n".format(str(not_authorized)))
|
||||
log(verbose, " Raised Google DDOS: {}\n".format(str(raised_ddos)))
|
||||
log(verbose, " Not modified archives: {}\n".format(str(not_modified)))
|
||||
log(verbose, " Extensions not in store: {}\n".format(str(not_in_store)))
|
||||
log(verbose, " Unknown exception: {}\n".format(str(has_exception)))
|
||||
log(verbose,
|
||||
" Unknown exception: {}\n".format(str(has_exception)))
|
||||
|
||||
|
||||
def main():
|
||||
|
|
Loading…
Reference in New Issue