Improved discovery script.

This commit is contained in:
Michael Herzberg 2017-01-17 16:08:20 +00:00
parent 5146f1808f
commit dbaa55286e
1 changed file with 67 additions and 43 deletions

View File

@ -1,4 +1,20 @@
#!/bin/env python3 #!/bin/env python3
#
# Copyright (C) 2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from selenium import webdriver from selenium import webdriver
import time import time
import os import os
@ -6,57 +22,65 @@ import re
import time import time
import sys import sys
import datetime import datetime
import argparse
class ExtensionExplorer:
    """Harvests extension ids from the Chrome Web Store category page.

    Drives a Selenium browser that repeatedly scrolls the lazily-loading
    extensions category page and clicks its "See More" button, periodically
    persisting the extension ids found in the rendered DOM.
    """

    def scroll(self, driver):
        """Scroll to the bottom of the page to trigger lazy loading."""
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')

    def click_more(self, driver):
        """Click the 'See More' button if it is currently visible.

        Raises a selenium lookup exception when the button is absent;
        the caller (run) catches and logs any exception from this.
        """
        more_btn = driver.find_element_by_xpath("//div[text()='See More']")
        if more_btn and more_btn.is_displayed():
            more_btn.click()

    def save_ids(self, driver, savefile):
        """Extract extension ids from the live DOM and write them to savefile.

        Returns False (leaving the file untouched) when the set of ids is
        unchanged since the previous save, True after writing a new set.
        """
        # driver.page_source does not give us the complete source,
        # unfortunately, so pull the rendered DOM instead.
        content = driver.execute_script('return document.body.innerHTML')
        # Extension ids are 32 lowercase letters appearing as the last path
        # component of a quoted detail-page URL. Raw string: '\/' in a plain
        # literal is an invalid escape sequence.
        ids = sorted(set(re.findall(r'/([a-z]{32})"', content)))
        oldids = []
        try:
            with open(savefile, 'r') as f:
                for line in f:
                    oldids.append(line.strip())
            oldids = sorted(set(oldids))
        except OSError:
            # No previous save (or unreadable file): treat as empty.
            pass
        if ids == oldids:
            return False
        with open(savefile, 'w') as f:
            f.write('\n'.join(ids))
        return True

    def run(self, outdir, interval):
        """Main crawl loop: scroll and click until no new ids appear.

        outdir   -- directory receiving one timestamped id file per run
        interval -- seconds between saves; also the give-up window
        """
        os.makedirs(outdir, exist_ok=True)
        savefile = os.path.join(
            outdir, 'ids-{}.txt'.format(datetime.datetime.now().isoformat()))
        driver = webdriver.PhantomJS(service_log_path=os.path.devnull)
        driver.get('https://chrome.google.com/webstore/category/extensions')

        last_save = 0
        while True:
            try:
                self.scroll(driver)
                self.click_more(driver)
            except Exception as e:
                # The 'See More' button is often absent; log and keep going.
                print(e, file=sys.stderr)

            if time.time() - last_save > interval:
                if not self.save_ids(driver, savefile):
                    # No new ids within one full interval: we are done.
                    driver.quit()
                    sys.exit(0)
                last_save = time.time()
            time.sleep(0.5)
if __name__ == '__main__':
    # Command-line entry point: parse arguments and start the crawler.
    # NOTE: the crawled site is the Chrome Web Store (see the URL in
    # ExtensionExplorer.run), not the Google Play Store.
    parser = argparse.ArgumentParser(
        description='Crawls the Chrome Web Store for new extension ids.')
    parser.add_argument(
        'out',
        help='The directory where the files with new ids should be stored.')
    parser.add_argument(
        '-t', '--interval',
        help='Saves the found ids to file every X seconds. If no new ids '
             'have been found within these X seconds, the crawler quits.',
        default=30.0, type=float)
    args = parser.parse_args()
    ExtensionExplorer().run(args.out, args.interval)