#!/usr/bin/env python3
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
"""Helpers for maintaining the on-disk archive of crawled extensions."""

import glob
import os

# NOTE(review): the original header also imported sys, re, requests,
# time.sleep, random.randint, datetime and `from ExtensionCrawler.util
# import *`; none of those names are used anywhere in this module, so the
# unused imports were dropped.  dateutil is imported lazily below so the
# module stays importable when python-dateutil is absent.


def get_local_archive_dir(id):
    """Return the archive subdirectory for extension *id*, e.g. "abc/abcdef...".

    Extensions are sharded by the first three characters of their id so no
    single directory grows unboundedly.  (The parameter shadows the builtin
    ``id``; the name is kept for interface compatibility with callers.)
    """
    return "{}/{}".format(id[:3], id)


def get_local_archive_dirs(id):
    """Return every archive directory for *id* (currently exactly one)."""
    return [get_local_archive_dir(id)]


def write_text(dir, fname, text):
    """Write *text* to the file *fname* inside directory *dir*."""
    with open(os.path.join(dir, fname), 'w') as f:
        f.write(text)


def store_request_metadata(dir, fname, request):
    """Persist headers, status code and final URL of a `requests` response."""
    write_text(dir, fname + ".headers", str(request.headers))
    write_text(dir, fname + ".status", str(request.status_code))
    write_text(dir, fname + ".url", str(request.url))


def store_request_text(dir, fname, request):
    """Persist the body of a `requests` response together with its metadata."""
    write_text(dir, fname, request.text)
    store_request_metadata(dir, fname, request)


def httpdate(dt):
    """Format the datetime *dt* as an RFC 1123 (HTTP) date string in GMT.

    The day/month names are spelled out by hand so the result is locale
    independent, as HTTP requires.
    """
    weekday = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()]
    month = [
        "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct",
        "Nov", "Dec"
    ][dt.month - 1]
    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
        weekday, dt.day, month, dt.year, dt.hour, dt.minute, dt.second)


def last_modified_utc_date(path):
    """Return the timestamp encoded in the parent directory name of *path*.

    Archive entries live in a directory named after their fetch date; this
    extracts that component.  Returns "" for an empty path.
    """
    # BUG FIX: the original tested `path is ""` — identity, not equality —
    # which is implementation-dependent and a SyntaxWarning on CPython >= 3.8.
    if not path:
        return ""
    return os.path.split(os.path.dirname(path))[1]


def last_modified_http_date(path):
    """Like last_modified_utc_date, but formatted as an HTTP date string."""
    # BUG FIX: same `is ""` identity comparison as above.
    if not path:
        return ""
    import dateutil.parser  # local import: python-dateutil only needed here
    return httpdate(dateutil.parser.parse(last_modified_utc_date(path)))


def last_crx(dir, extid):
    """Return the newest archived *.crx below the parent of *dir*, or "".

    *extid* is currently unused but kept for interface compatibility.
    """
    old_archives = sorted(
        glob.glob(os.path.join(os.path.dirname(dir), "*/*.crx")))
    # sorted() puts the lexicographically greatest (newest timestamped)
    # archive last; an empty result yields "".
    return old_archives[-1] if old_archives else ""
def const_sitemap_url():
    """URL of the Chrome Web Store sitemap index."""
    return "https://chrome.google.com/webstore/sitemap"


def const_sitemap_scheme():
    """XML namespace used by the sitemap protocol."""
    return "http://www.sitemaps.org/schemas/sitemap/0.9"


def const_overview_url(id):
    """Web-store overview (detail) page for extension *id*."""
    return 'https://chrome.google.com/webstore/detail/{}'.format(id)


def const_store_url():
    """Web-store landing page."""
    return 'https://chrome.google.com/webstore'


def const_review_url():
    """Endpoint serving review comment threads."""
    return 'https://chrome.google.com/reviews/components'


def const_support_url():
    """Endpoint serving support comment threads (same host as reviews)."""
    return 'https://chrome.google.com/reviews/components'


def const_download_url():
    """CRX download URL template; the caller formats in the extension id."""
    return ('https://clients2.google.com/service/update2/crx'
            '?response=redirect&nacl_arch=x86-64&prodversion=9999.0.9999.0'
            '&x=id%3D{}%26uc')


def const_categories():
    """Web-store category slugs visited by the discovery crawl."""
    return [
        'extensions',
        'ext/22-accessibility',
        'ext/10-blogging',
        'ext/15-by-google',
        'ext/11-web-development',
        'ext/14-fun',
        'ext/6-news',
        'ext/28-photos',
        'ext/7-productivity',
        'ext/38-search-tools',
        'ext/12-shopping',
        'ext/1-communication',
        'ext/13-sports',
    ]


def const_support_payload(ext_id, start, end):
    """Form payload requesting support threads for *ext_id*.

    *start* is the first comment index, *end* the number of results.
    Doubled braces escape literal { } in str.format.
    """
    template = ('req={{ "appId":94,"version":"150922","hl":"en",'
                '"specs":[{{"type":"CommentThread",'
                '"url":"http%3A%2F%2Fchrome.google.com%2Fextensions'
                '%2Fpermalink%3Fid%3D{}",'
                '"groups":"chrome_webstore_support",'
                '"startindex":"{}","numresults":"{}","id":"379"}}],'
                '"internedKeys":[],"internedValues":[]}}')
    return template.format(ext_id, start, end)


def const_review_payload(ext_id, start, end):
    """Form payload requesting review threads for *ext_id*.

    Same shape as const_support_payload but a different comment group,
    sort order and spec id.
    """
    template = ('req={{ "appId":94,"version":"150922","hl":"en",'
                '"specs":[{{"type":"CommentThread",'
                '"url":"http%3A%2F%2Fchrome.google.com%2Fextensions'
                '%2Fpermalink%3Fid%3D{}",'
                '"groups":"chrome_webstore","sortby":"cws_qscore",'
                '"startindex":"{}","numresults":"{}","id":"428"}}],'
                '"internedKeys":[],"internedValues":[]}}')
    return template.format(ext_id, start, end)
os.path.split(os.path.dirname(path))[1] - - -def last_modified_http_date(path): - if path is "": - return "" - return httpdate(dateutil.parser.parse(last_modified_utc_date(path))) - - -def last_crx(dir, extid): - old_archives = sorted( - glob.glob(os.path.join(os.path.dirname(dir), "*/*.crx"))) - last_archive = "" - if old_archives != []: - last_archive = old_archives[-1] - return last_archive - - def validate_crx_response(res, extfilename): regex_extfilename = re.compile(r'^extension[_0-9]+\.crx$') if not 'Content-Type' in res.headers: