Re-formatting.
This commit is contained in:
parent
a67aaf0501
commit
e096b9a2d5
33
crawler.py
33
crawler.py
|
@ -96,7 +96,7 @@ class ExtensionCrawler:
|
|||
with open(name + ".url", 'w') as f:
|
||||
f.write(str(request.url))
|
||||
|
||||
def google_dos_protection(self, name, request,max=3):
|
||||
def google_dos_protection(self, name, request, max=3):
|
||||
if max >= 1:
|
||||
sleep(randint(1, max) * .5)
|
||||
|
||||
|
@ -145,8 +145,7 @@ class ExtensionCrawler:
|
|||
raise CrawlError(
|
||||
extid,
|
||||
'Expected Content-Type header to be application/x-chrome-extension, but got {}.'.
|
||||
format(extresult.headers['Content-Type']),
|
||||
'\n'.join(text))
|
||||
format(extresult.headers['Content-Type']), '\n'.join(text))
|
||||
if not self.regex_extfilename.match(extfilename):
|
||||
raise CrawlError(
|
||||
extid,
|
||||
|
@ -163,7 +162,7 @@ class ExtensionCrawler:
|
|||
self.store_request_metadata(
|
||||
os.path.join(extdir, 'storepage.html'), extpageresult)
|
||||
self.google_dos_protection(
|
||||
os.path.join(extdir, 'storepage.html'), extpageresult,0.1)
|
||||
os.path.join(extdir, 'storepage.html'), extpageresult, 0.1)
|
||||
with open(os.path.join(extdir, 'storepage.html'), 'w') as f:
|
||||
f.write(extpageresult.text)
|
||||
|
||||
|
@ -234,7 +233,12 @@ class ExtensionCrawler:
|
|||
os.path.dirname(os.path.relpath(path, self.basedir)))[1]
|
||||
return self.httpdate(dateutil.parser.parse(utc))
|
||||
|
||||
def update_extension(self, extid, overwrite, extinfo=None,cnt=None,max_cnt=None):
|
||||
def update_extension(self,
|
||||
extid,
|
||||
overwrite,
|
||||
extinfo=None,
|
||||
cnt=None,
|
||||
max_cnt=None):
|
||||
if not self.regex_extid.match(extid):
|
||||
raise CrawlError(extid,
|
||||
'{} is not a valid extension id.\n'.format(extid))
|
||||
|
@ -319,7 +323,6 @@ class ExtensionCrawler:
|
|||
|
||||
return True
|
||||
|
||||
|
||||
def update_extension_list(self, extensions):
|
||||
n_attempts = 0
|
||||
n_success = 0
|
||||
|
@ -329,12 +332,13 @@ class ExtensionCrawler:
|
|||
for extid in extensions:
|
||||
try:
|
||||
n_attempts += 1
|
||||
self.update_extension(extid, True,None,n_attempts,len(extensions))
|
||||
self.update_extension(extid, True, None, n_attempts,
|
||||
len(extensions))
|
||||
n_success += 1
|
||||
except CrawlError as cerr:
|
||||
retry_extids.append(extid)
|
||||
sys.stdout.write(' Error: {}\n'.format(cerr.message))
|
||||
n_errors +=1
|
||||
n_errors += 1
|
||||
if cerr.pagecontent != "":
|
||||
sys.stderr.write(' Page content was:\n')
|
||||
sys.stderr.write(' {}\n'.format(cerr.pagecontent))
|
||||
|
@ -345,14 +349,15 @@ class ExtensionCrawler:
|
|||
except ConnectionResetError as cerr:
|
||||
retry_extids.append(extid)
|
||||
sys.stdout.write(' Error: {}\n'.format(str(cerr)))
|
||||
n_errors +=1
|
||||
n_errors += 1
|
||||
|
||||
sys.stdout.flush()
|
||||
if self.verbose:
|
||||
print("*** Summary: Updated {} of {} extensions successfully".
|
||||
format(n_success, n_attempts))
|
||||
print("*** Login required: {}".format(n_login_required))
|
||||
print("*** Hit Google DOS protection: {}".format(self.google_dos_count))
|
||||
print("*** Hit Google DOS protection: {}".format(
|
||||
self.google_dos_count))
|
||||
print("*** Other Erros: {}".format(n_errors))
|
||||
sys.stdout.flush()
|
||||
return retry_extids
|
||||
|
@ -465,7 +470,10 @@ if __name__ == '__main__':
|
|||
parser.add_argument(
|
||||
'-v', '--verbose', action='store_true', help='Increase verbosity.')
|
||||
parser.add_argument(
|
||||
'-o', '--overview', action='store_true', help='Only download/update overview page.')
|
||||
'-o',
|
||||
'--overview',
|
||||
action='store_true',
|
||||
help='Only download/update overview page.')
|
||||
parser.add_argument(
|
||||
'-w',
|
||||
'--weak',
|
||||
|
@ -473,7 +481,8 @@ if __name__ == '__main__':
|
|||
help='weak check if crx exists already')
|
||||
|
||||
args = parser.parse_args()
|
||||
crawler = ExtensionCrawler(args.dest, args.verbose, args.weak,args.overview)
|
||||
crawler = ExtensionCrawler(args.dest, args.verbose, args.weak,
|
||||
args.overview)
|
||||
|
||||
if args.discover:
|
||||
if args.interval:
|
||||
|
|
Loading…
Reference in New Issue