Re-discover already archived ids.

This commit is contained in:
Achim D. Brucker 2017-01-23 18:54:32 +00:00
parent 3dc7a5d663
commit be36b7f129
1 changed files with 97 additions and 0 deletions

97
crawler Executable file
View File

@ -0,0 +1,97 @@
#!/usr/bin/env python3
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
import sys
import ExtensionCrawler.discover
import ExtensionCrawler.archive
import glob
import re
def update_overview(dir, verbose, ext_id):
    """Stub: refresh the overview page of extension *ext_id* in *dir*.

    The actual HTTP download is disabled for now; the function only
    reports progress when *verbose* is set and claims success.
    """
    if not verbose:
        return True
    sys.stdout.write(" * overview page: ")
    # Download currently disabled:
    # res = requests.get(ExtensionCrawler.config.const_overview_url.format(ext_id))
    # if verbose:
    #     sys.stdout.write("{}\n".format(str(res.status)))
    return True
def update_crx(dir, verbose, ext_id):
    """Stub: refresh the crx archive of extension *ext_id* in *dir*.

    Download is not implemented yet; only prints progress when *verbose*.
    """
    if not verbose:
        return True
    sys.stdout.write(" * crx archive\n")
    return True
def update_reviews(dir, verbose, ext_id):
    """Stub: refresh the review page of extension *ext_id* in *dir*.

    Download is not implemented yet; only prints progress when *verbose*.
    """
    if not verbose:
        return True
    sys.stdout.write(" * review page\n")
    return True
def update_support(dir, verbose, ext_id):
    """Stub: refresh the support page of extension *ext_id* in *dir*.

    Download is not implemented yet; only prints progress when *verbose*.
    """
    if not verbose:
        return True
    sys.stdout.write(" * support page\n")
    return True
def update_extension(basedir, verbose, forums, ext_id):
    """Update the local archive entry for a single extension.

    Creates the extension's archive directory under *basedir* (if it
    does not exist yet) and refreshes the overview page and the crx
    archive; when *forums* is true the review and support pages are
    refreshed as well.

    :param basedir: root directory of the local archive
    :param verbose: emit per-step progress messages on stdout
    :param forums:  also update the forum-backed pages (reviews/support)
    :param ext_id:  32-character extension id
    """
    sys.stdout.write("    Update Extension: " + ext_id + "\n")
    if verbose:
        sys.stdout.write("    Updating {}".format(ext_id))
        if forums:
            sys.stdout.write(" (including forums)")
        sys.stdout.write("\n")
    # os.path.join instead of string concatenation; ext_dir avoids
    # shadowing the builtin dir().
    ext_dir = os.path.join(
        basedir, ExtensionCrawler.archive.get_local_archive_dir(ext_id))
    os.makedirs(ext_dir, exist_ok=True)
    update_overview(ext_dir, verbose, ext_id)
    update_crx(ext_dir, verbose, ext_id)
    if forums:
        update_reviews(ext_dir, verbose, ext_id)
        update_support(ext_dir, verbose, ext_id)
def update_extensions(basedir, verbose, forums_ext_ids, ext_ids):
    """Update the local archive entries for all ids in *ext_ids*.

    Forum pages (reviews/support) are updated only for ids that are
    also listed in *forums_ext_ids*.

    :return: list of per-extension results from :func:`update_extension`
        (currently a list of ``None``, kept for interface stability).
    """
    # Set for O(1) membership tests instead of scanning a list per id.
    forums_wanted = set(forums_ext_ids)
    # Comprehension replaces list(map(lambda ...)): clearer and idiomatic.
    return [update_extension(basedir, verbose, ext_id in forums_wanted, ext_id)
            for ext_id in ext_ids]
def get_existing_ids(basedir, verbose):
    """Return the ids of all extensions already present in the archive.

    The archive layout is ``<basedir>/<prefix>/<32-char-id>``; the id is
    the final path component. *verbose* is accepted for interface
    symmetry with the other helpers but is currently unused.

    :param basedir: root directory of the local archive
    :return: list of 32-character extension ids (unsorted)
    """
    # An extension id is 32 characters drawn from [0-9a-z]; glob has no
    # repetition operator, so spell out the 32-character pattern.
    ext_id_glob = '[0-9a-z]' * 32
    # os.path.basename replaces re.sub("^.*\/", "", d): portable across
    # path separators and avoids the invalid '\/' escape sequence.
    return [os.path.basename(path)
            for path in glob.glob(os.path.join(basedir, "*", ext_id_glob))]
def main():
    """Discover extension ids and update the local archive for all of them.

    Currently uses a single hard-coded id instead of the real discovery
    crawl, and an empty forum-id list (both marked TODO).
    """
    basedir = "./archive"
    verbose = True
    sys.stdout.write("Crawling ID\n")
    # TODO: replace hard-coded id with
    #       ExtensionCrawler.discover.crawl_nearly_all_of_ext_ids()
    discovered_ids = ['adndegnbdnefpaelcbpdchpngejmggbl']
    existing_ids = get_existing_ids(basedir, verbose)
    new_ids = list(set(discovered_ids) - set(existing_ids))
    # Typo fixed ("Discoverd" -> "Discovered"); str() around len() was
    # redundant, format() handles ints.
    sys.stdout.write(
        "    Discovered {} ids ({} of them are new, {} already known)\n".format(
            len(discovered_ids), len(new_ids), len(existing_ids)))
    # TODO: read forum-enabled ids from <basedir>/forum_ext_ids.txt
    forum_ext_ids = []
    update_extensions(basedir, verbose, forum_ext_ids, existing_ids + new_ids)

# Guard the entry point so the module can be imported without side effects.
if __name__ == "__main__":
    main()