Improved discovery script.
This commit is contained in:
parent
5146f1808f
commit
dbaa55286e
|
@ -1,4 +1,20 @@
|
||||||
#!/bin/env python3
|
#!/bin/env python3
|
||||||
|
#
|
||||||
|
# Copyright (C) 2017 The University of Sheffield, UK
|
||||||
|
#
|
||||||
|
# This program is free software: you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation, either version 3 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
from selenium import webdriver
|
from selenium import webdriver
|
||||||
import time
|
import time
|
||||||
import os
|
import os
|
||||||
|
@ -6,26 +22,27 @@ import re
|
||||||
import time
|
import time
|
||||||
import sys
|
import sys
|
||||||
import datetime
|
import datetime
|
||||||
|
import argparse
|
||||||
|
|
||||||
def scroll(driver):
|
class ExtensionExplorer:
|
||||||
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
|
def scroll(self, driver):
|
||||||
|
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
|
||||||
|
|
||||||
def click_more(driver):
|
def click_more(self, driver):
|
||||||
more_btn = driver.find_element_by_xpath("//div[text()='See More']")
|
more_btn = driver.find_element_by_xpath("//div[text()='See More']")
|
||||||
if more_btn and more_btn.is_displayed():
|
if more_btn and more_btn.is_displayed():
|
||||||
more_btn.click()
|
more_btn.click()
|
||||||
#print("More Button found, seen and clicked")
|
|
||||||
|
|
||||||
def save_ids(driver, savefile):
|
def save_ids(self, driver, savefile):
|
||||||
#This does not give is the complete source, unfortunately:
|
#This does not give us the complete source, unfortunately:
|
||||||
#content = driver.page_source
|
#content = driver.page_source
|
||||||
|
|
||||||
content = driver.execute_script("return document.body.innerHTML")
|
content = driver.execute_script('return document.body.innerHTML')
|
||||||
ids = sorted(set(re.findall("""[a-z]{32}""", content)))
|
ids = sorted(set(re.findall('\/([a-z]{32})"', content)))
|
||||||
|
|
||||||
oldids = []
|
oldids = []
|
||||||
try:
|
try:
|
||||||
with open(savefile, "r") as f:
|
with open(savefile, 'r') as f:
|
||||||
for line in f:
|
for line in f:
|
||||||
oldids.append(line.strip())
|
oldids.append(line.strip())
|
||||||
oldids = sorted(set(oldids))
|
oldids = sorted(set(oldids))
|
||||||
|
@ -34,29 +51,36 @@ def save_ids(driver, savefile):
|
||||||
if ids == oldids:
|
if ids == oldids:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
with open(savefile, "w") as f:
|
with open(savefile, 'w') as f:
|
||||||
f.write("\n".join(ids))
|
f.write('\n'.join(ids))
|
||||||
#print("IDs written")
|
|
||||||
return True
|
return True
|
||||||
|
def run(self, outdir, interval):
|
||||||
|
os.makedirs(outdir, exist_ok=True)
|
||||||
|
savefile = os.path.join(outdir, 'ids-{}.txt'.format(datetime.datetime.now().isoformat()))
|
||||||
|
|
||||||
savedir = "."
|
driver = webdriver.PhantomJS(service_log_path=os.path.devnull)
|
||||||
if len(sys.argv) > 1:
|
|
||||||
savedir = sys.argv[1]
|
|
||||||
os.makedirs(savedir, exist_ok=True)
|
|
||||||
savefile = os.path.join(savedir, "ids-{}.txt".format(datetime.datetime.now().isoformat()))
|
|
||||||
|
|
||||||
driver = webdriver.PhantomJS()
|
|
||||||
content = driver.get('https://chrome.google.com/webstore/category/extensions')
|
content = driver.get('https://chrome.google.com/webstore/category/extensions')
|
||||||
|
|
||||||
last_save = 0
|
last_save = 0
|
||||||
while True:
|
while True:
|
||||||
scroll(driver)
|
try:
|
||||||
click_more(driver)
|
self.scroll(driver)
|
||||||
|
self.click_more(driver)
|
||||||
|
except Exception as e:
|
||||||
|
print(e, file=sys.stderr)
|
||||||
|
|
||||||
if time.time() - last_save > 30.0:
|
if time.time() - last_save > interval:
|
||||||
if not save_ids(driver, savefile):
|
if not self.save_ids(driver, savefile):
|
||||||
#print("No new extension ids since last save, exiting...")
|
driver.quit()
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
last_save = time.time()
|
last_save = time.time()
|
||||||
|
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description='Crawls the Google Play Store for new extension ids.')
|
||||||
|
parser.add_argument('out', help='The directory where the files with new ids should be stored.')
|
||||||
|
parser.add_argument('-t', '--interval', help='Saves the found ids to file every X seconds. If no new ids have been found within these X seconds, the crawler quits.', default=30.0, type=float)
|
||||||
|
args = parser.parse_args()
|
||||||
|
ExtensionExplorer().run(args.out, args.interval)
|
||||||
|
|
Loading…
Reference in New Issue