forked from BrowserSecurity/ExtensionCrawler
Added python-based sqlite merger.
This commit is contained in:
parent
11604c0fa5
commit
44756e44d6
|
@ -10,6 +10,4 @@ BASEDIR=${1:-/shared/brucker_research1/Shared/BrowserExtensions/data}
|
|||
DBPATH=${2:-~/aa-ac.sqlite}
|
||||
EXTENSIONCRAWLER=${3:-~/ExtensionCrawler}
|
||||
|
||||
find "$BASEDIR"/aa* -name "*.sqlite" -exec "$EXTENSIONCRAWLER/scripts/merge_dbs.sh" "{}" "$DBPATH" \;
|
||||
find "$BASEDIR"/ab* -name "*.sqlite" -exec "$EXTENSIONCRAWLER/scripts/merge_dbs.sh" "{}" "$DBPATH" \;
|
||||
find "$BASEDIR"/ac* -name "*.sqlite" -exec "$EXTENSIONCRAWLER/scripts/merge_dbs.sh" "{}" "$DBPATH" \;
|
||||
find "$BASEDIR" -mindepth 1 -maxdepth 1 -name "a[a-c]*" -exec "$EXTENSIONCRAWLER/scripts/merge_dbs" "{}" "$DBPATH" \;
|
||||
|
|
|
@ -45,7 +45,7 @@ fi
|
|||
date +"* Start Creating full.sqlite Data Base (%c) using $SQLITE" | tee -a $LOG
|
||||
# Update full database
|
||||
rm -f $ARCHIVE/db/full.sqlite
|
||||
find "$ARCHIVE"/data/ -name "*.sqlite" -exec "$CRAWLERHOME/scripts/merge_dbs.sh" "{}" "$ARCHIVE"/db/full.sqlite \; &> $LOGPREFIX-sqlite-full.log
|
||||
"$CRAWLERHOME/scripts/merge_dbs" "$ARCHIVE/data" "$ARCHIVE/db/full.sqlite" &> $LOGPREFIX-sqlite-full.log
|
||||
if [ $? -ne "0" ]; then
|
||||
echo " Creation of full.sqlite failed - see log file for details" | tee -a $LOG
|
||||
else
|
||||
|
|
|
@ -0,0 +1,127 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import sqlite3
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import fnmatch
|
||||
|
||||
MAX_ATTACHED_DBS = 10
|
||||
|
||||
|
||||
def merge_and_detach(con, currently_attached, mappings, columnnames):
    """Copy the rows of every attached database into the main one, then detach.

    Args:
        con: Open sqlite3 connection whose main database is the merge target
            and to which the databases in ``currently_attached`` are attached.
        currently_attached: Schema names of the attached source databases.
        mappings: Maps each source table name to its destination table name.
        columnnames: Maps each source table name to the list of columns that
            are copied, in destination column order.
    """

    def quote(identifier):
        # Double-quote identifiers (doubling embedded quotes) so reserved
        # words or unusual characters cannot break the generated SQL.
        return '"{}"'.format(identifier.replace('"', '""'))

    for db in currently_attached:
        for fromtable, totable in mappings.items():
            # Columns are qualified with the table name: SQLite treats a bare
            # double-quoted name that matches no column as a string literal,
            # which would silently insert the column name instead of failing.
            columns = ",".join(
                "{}.{}".format(quote(fromtable), quote(column))
                for column in columnnames[fromtable])
            con.execute("INSERT INTO {} SELECT {} FROM {}.{};".format(
                quote(totable), columns, quote(db), quote(fromtable)))

    # SQLite refuses to detach a database inside an open transaction, so the
    # inserts are committed first.
    con.commit()
    for db in currently_attached:
        con.execute("DETACH DATABASE {}".format(quote(db)))
|
||||
|
||||
|
||||
def get_mappings_and_column_names(dbpath):
    """Work out which tables of ``dbpath`` are merged and with which columns.

    Tables whose names end in ``_segments``, ``_segdir``, ``_docsize`` or
    ``_stat`` are ignored, as are virtual tables declared ``using fts...``.
    A table named ``X_content`` is mapped to destination table ``X`` and its
    first column is dropped; every other table maps to itself with all of
    its columns.

    Args:
        dbpath: Path of the sqlite database to inspect.

    Returns:
        A ``(mappings, columnnames)`` tuple: ``mappings`` maps source table
        name to destination table name, ``columnnames`` maps source table
        name to the list of column names to copy.
    """
    # Local import: the sqlite3 connection context manager only commits or
    # rolls back, it does not close the connection.
    from contextlib import closing

    # SQL keywords are case-insensitive, so match the stored CREATE
    # statement regardless of how it was capitalised.
    fts_table = re.compile(r"^CREATE VIRTUAL TABLE (\S+) using fts",
                           re.IGNORECASE)
    content_suffix = "_content"

    mappings = {}
    columnnames = {}
    with closing(sqlite3.connect(dbpath)) as con:
        tables = con.execute(
            """select name,sql from sqlite_master where type='table'"""
            """ and name NOT LIKE '%!_segments' escape '!'"""
            """ and name NOT LIKE '%!_segdir' escape '!'"""
            """ and name NOT LIKE '%!_docsize' escape '!'"""
            """ and name NOT LIKE '%!_stat' escape '!'"""
            """;""").fetchall()

        for table, create_string in tables:
            if create_string and fts_table.match(create_string):
                continue

            # Each table_info row is (cid, name, type, notnull, dflt_value,
            # pk); only the column name is needed.
            quoted_table = '"{}"'.format(table.replace('"', '""'))
            cnames = [
                row[1] for row in con.execute(
                    "pragma table_info({});".format(quoted_table)).fetchall()
            ]
            if table.endswith(content_suffix):
                mappings[table] = table[:-len(content_suffix)]
                # Skip the first column of the *_content table; it is not
                # part of the destination table's column list.
                columnnames[table] = cnames[1:]
            else:
                mappings[table] = table
                columnnames[table] = cnames

    return (mappings, columnnames)
|
||||
|
||||
|
||||
def merge_schema(con, dbpath):
    """Recreate the table definitions of ``dbpath`` in the main database.

    The source database is attached as ``schemadb``; the CREATE statement of
    every table whose name does not end in ``_segments``, ``_segdir``,
    ``_docsize``, ``_stat`` or ``_content`` is printed and executed on
    ``con``. The source database is detached again afterwards.

    Args:
        con: Open sqlite3 connection to the destination database.
        dbpath: Path of the sqlite database whose schema is copied.
    """
    con.execute("ATTACH DATABASE ? as schemadb;", (dbpath, ))
    try:
        # Read all statements up front so that no cursor over sqlite_master
        # is still being iterated while the schema is modified.
        create_strings = con.execute(
            """select sql from schemadb.sqlite_master where type='table'"""
            """ and name NOT LIKE '%!_segments' escape '!'"""
            """ and name NOT LIKE '%!_segdir' escape '!'"""
            """ and name NOT LIKE '%!_docsize' escape '!'"""
            """ and name NOT LIKE '%!_stat' escape '!'"""
            """ and name NOT LIKE '%!_content' escape '!'"""
            """;"""
        ).fetchall()

        for (create_string, ) in create_strings:
            print(create_string)
            con.execute(create_string)
    finally:
        # Detach even when a CREATE statement fails, so the connection is
        # not left with a stale "schemadb" attachment.
        con.execute("DETACH DATABASE schemadb;")
|
||||
|
||||
|
||||
def find(pattern, path):
    """Yield the path of every file below ``path`` whose name matches ``pattern``.

    Args:
        pattern: fnmatch-style pattern tested against the bare file name.
        path: Directory that is walked recursively.
    """
    for directory, _subdirs, filenames in os.walk(path):
        matching = fnmatch.filter(filenames, pattern)
        yield from (os.path.join(directory, fname) for fname in matching)
|
||||
|
||||
def help():
    """Print the command-line usage summary to standard output."""
    usage_lines = (
        "Usage: merge_dbs DBSPATH TODB",
        " DBSPATH the folder containing the *.sqlite files",
        " (searched recursivly)",
        " TODB the destination sqlite file",
    )
    print("\n".join(usage_lines))
|
||||
|
||||
def main(argv):
    """Merge sqlite databases into one destination database.

    Args:
        argv: Command-line arguments without the program name; exactly
            ``[DBSPATH, TODB]``. ``DBSPATH`` is either a single sqlite file
            or a directory that is searched recursively for ``*.sqlite``
            files; ``TODB`` is the destination sqlite file.

    Exits with status 1 when the argument count is wrong or when no source
    database is found.
    """
    if len(argv) != 2:
        help()
        sys.exit(1)
    dbspath, todb = argv

    print("Using sqlite3 version {}".format(sqlite3.sqlite_version))

    if os.path.isdir(dbspath):
        sqlitepaths = list(find("*.sqlite", dbspath))
    else:
        sqlitepaths = [dbspath]

    # An empty directory would otherwise surface as an IndexError below.
    if not sqlitepaths:
        print("No *.sqlite files found in {}".format(dbspath),
              file=sys.stderr)
        sys.exit(1)

    # The first database found serves as the template for the table
    # mappings, the column lists and (if needed) the destination schema.
    firstdb = sqlitepaths[0]
    mappings, columnnames = get_mappings_and_column_names(firstdb)
    print("Mappings:")
    print(json.dumps(mappings, indent=4))
    print("Column names:")
    print(json.dumps(columnnames, indent=4))

    con = sqlite3.connect(todb)
    try:
        # "with con" commits on success and rolls back on error; it does not
        # close the connection, hence the surrounding try/finally.
        with con:
            table_count = con.execute(
                "SELECT COUNT(*) FROM sqlite_master;").fetchone()[0]
            if table_count == 0:
                print("Merging schema from {}".format(firstdb))
                merge_schema(con, firstdb)

            # Attach the sources in batches of MAX_ATTACHED_DBS, merging and
            # detaching each batch before attaching the next one.
            currently_attached = []
            last_index = len(sqlitepaths) - 1
            for i, dbpath in enumerate(sqlitepaths):
                dbname = "db{}".format(i)
                print("Attaching {}".format(dbpath))
                con.execute("ATTACH DATABASE ? as ?", (dbpath, dbname))
                currently_attached.append(dbname)
                batch_full = len(currently_attached) % MAX_ATTACHED_DBS == 0
                if batch_full or i == last_index:
                    merge_and_detach(con, currently_attached, mappings,
                                     columnnames)
                    currently_attached = []
            con.commit()
    finally:
        con.close()
|
||||
|
||||
|
||||
# Script entry point: hand the command-line arguments (without the program
# name) to main().
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    main(cli_args)
|
|
@ -16,4 +16,4 @@ override_vars
|
|||
|
||||
set -u
|
||||
|
||||
find $DBDIR -name "*.sqlite" -exec "$EXTENSIONCRAWLERDIR/scripts/merge_dbs.sh" "{}" "$OUTDBPATH" \;
|
||||
"$EXTENSIONCRAWLERDIR/scripts/merge_dbs" "$DBDIR" "$OUTDBPATH"
|
||||
|
|
|
@ -1,36 +0,0 @@
|
|||
#!/bin/bash
# Merge the contents of one sqlite database into another.
# Usage: merge_dbs.sh FROM_DB TO_DB
#   FROM_DB  existing source sqlite file
#   TO_DB    destination sqlite file (created from FROM_DB's schema if missing)
FROM_DB=$1
TO_DB=$2

# All expansions are quoted so paths containing spaces or glob characters work.
if [ -z "$FROM_DB" ] || ! [ -f "$FROM_DB" ]; then
    echo "source db not provided or does not exist"
    exit 1
fi

if [ -z "$TO_DB" ]; then
    echo "destination db not provided"
    exit 1
fi

if ! [ -f "$TO_DB" ]; then
    echo "Creating $TO_DB ..."
    # Replay the source schema, leaving out the CREATE statements of the
    # *_content, *_docsize, *_segments, *_stat and *_segdir tables
    # (presumably the FTS shadow tables -- confirm against the schema).
    sqlite3 "$FROM_DB" .schema | grep -Eiv \
        -e "^CREATE TABLE IF NOT EXISTS (\"|')?[a-z]+_content(\"|')?\(" \
        -e "^CREATE TABLE IF NOT EXISTS (\"|')?[a-z]+_docsize(\"|')?\(" \
        -e "^CREATE TABLE IF NOT EXISTS (\"|')?[a-z]+_segments(\"|')?\(" \
        -e "^CREATE TABLE IF NOT EXISTS (\"|')?[a-z]+_stat(\"|')?\(" \
        -e "^CREATE TABLE IF NOT EXISTS (\"|')?[a-z]+_segdir(\"|')?\(" \
        | sqlite3 "$TO_DB"
fi

echo "Merging $FROM_DB into $TO_DB..."

# Replay the source dump without CREATE TABLE statements and without inserts
# into sqlite_master or the *_segments, *_segdir, *_docsize and *_stat tables.
# The sed step rewrites inserts into X_content as inserts into X and drops
# the leading numeric value.
sqlite3 "$FROM_DB" .dump | grep -Eiv \
    -e "^CREATE TABLE" \
    -e "^INSERT INTO (\"|')?sqlite_master(\"|')?" \
    -e "^INSERT INTO (\"|')?[a-z]+_segments(\"|')? " \
    -e "^INSERT INTO (\"|')?[a-z]+_segdir(\"|')? " \
    -e "^INSERT INTO (\"|')?[a-z]+_docsize(\"|')? " \
    -e "^INSERT INTO (\"|')?[a-z]+_stat(\"|')? " \
    | sed -r "s/^INSERT INTO ([a-z]+)_content VALUES\([[:digit:]]+,/INSERT INTO \1 VALUES(/I" \
    | sqlite3 "$TO_DB"
|
|
@ -19,5 +19,5 @@ override_vars
|
|||
|
||||
find "$DBDIR" -name "$(task_id_to_letter_256 $SGE_TASK_ID)*.sqlite" -print0 | while IFS= read -r -d '' file; do
|
||||
DBNAME=$(basename "$file")
|
||||
"$EXTENSIONCRAWLERDIR/scripts/merge_dbs.sh" "$file" "$OUTDBPATH/${DBNAME:0:3}/$DBNAME"
|
||||
"$EXTENSIONCRAWLERDIR/scripts/merge_dbs" "$file" "$OUTDBPATH/${DBNAME:0:3}/$DBNAME"
|
||||
done
|
||||
|
|
Loading…
Reference in New Issue