Added python-based sqlite merger.

This commit is contained in:
Michael Herzberg 2017-07-28 16:47:25 +01:00
parent 11604c0fa5
commit 44756e44d6
6 changed files with 131 additions and 42 deletions

View File

@ -10,6 +10,4 @@ BASEDIR=${1:-/shared/brucker_research1/Shared/BrowserExtensions/data}
DBPATH=${2:-~/aa-ac.sqlite}
EXTENSIONCRAWLER=${3:-~/ExtensionCrawler}
find "$BASEDIR"/aa* -name "*.sqlite" -exec "$EXTENSIONCRAWLER/scripts/merge_dbs.sh" "{}" "$DBPATH" \;
find "$BASEDIR"/ab* -name "*.sqlite" -exec "$EXTENSIONCRAWLER/scripts/merge_dbs.sh" "{}" "$DBPATH" \;
find "$BASEDIR"/ac* -name "*.sqlite" -exec "$EXTENSIONCRAWLER/scripts/merge_dbs.sh" "{}" "$DBPATH" \;
find "$BASEDIR" -mindepth 1 -maxdepth 1 -name "a[a-c]*" -exec "$EXTENSIONCRAWLER/scripts/merge_dbs" "{}" "$DBPATH" \;

View File

@ -45,7 +45,7 @@ fi
date +"* Start Creating full.sqlite Data Base (%c) using $SQLITE" | tee -a $LOG
# Update full database
rm -f $ARCHIVE/db/full.sqlite
find "$ARCHIVE"/data/ -name "*.sqlite" -exec "$CRAWLERHOME/scripts/merge_dbs.sh" "{}" "$ARCHIVE"/db/full.sqlite \; &> $LOGPREFIX-sqlite-full.log
"$CRAWLERHOME/scripts/merge_dbs" "$ARCHIVE/data" "$ARCHIVE/db/full.sqlite" &> $LOGPREFIX-sqlite-full.log
if [ $? -ne "0" ]; then
echo " Creation of full.sqlite failed - see log file for details" | tee -a $LOG
else

127
scripts/merge_dbs Executable file
View File

@ -0,0 +1,127 @@
#!/usr/bin/env python3
import sqlite3
import sys
import os
import re
import json
import fnmatch
# Number of source databases that main() keeps attached to the destination
# connection before it merges and detaches the whole batch.
# NOTE(review): presumably chosen to match SQLite's default
# SQLITE_MAX_ATTACHED limit of 10 -- confirm against the deployed build.
MAX_ATTACHED_DBS = 10
def merge_and_detach(con, currently_attached, mappings, columnnames):
    """Copy the rows of every attached database into main, then detach them.

    Args:
        con: Open sqlite3 connection whose main database is the merge target.
        currently_attached: Schema names under which the source databases
            are attached to ``con``.
        mappings: Maps each source table name to the destination table name.
        columnnames: Maps each source table name to the list of columns that
            are selected from it (in order).
    """
    # Generated lazily so statements are built and executed one at a time.
    inserts = (
        "INSERT INTO {} SELECT {} from {}.{};".format(
            target, ",".join(columnnames[source]), schema, source)
        for schema in currently_attached
        for source, target in mappings.items()
    )
    for statement in inserts:
        con.execute(statement)

    # Finish the pending transaction first; presumably required because
    # SQLite cannot detach a database inside an open transaction.
    con.commit()

    for schema in currently_attached:
        con.execute("DETACH DATABASE {}".format(schema))
def get_mappings_and_column_names(dbpath):
    """Inspect one database and work out how its tables are merged.

    Tables whose names end in ``_segments``, ``_segdir``, ``_docsize`` or
    ``_stat`` are ignored, and so are the FTS virtual tables themselves.
    A table named ``<x>_content`` is mapped onto ``<x>`` and its first column
    is dropped; every other table maps onto a table of the same name with
    all of its columns.

    Args:
        dbpath: Path of the sqlite file to inspect.

    Returns:
        A ``(mappings, columnnames)`` tuple. ``mappings`` maps source table
        name to destination table name; ``columnnames`` maps source table
        name to the list of column names to copy.
    """
    con = sqlite3.connect(dbpath)
    try:
        tables = con.execute(
            """select name,sql from sqlite_master where type='table'"""
            """ and name NOT LIKE '%!_segments' escape '!'"""
            """ and name NOT LIKE '%!_segdir' escape '!'"""
            """ and name NOT LIKE '%!_docsize' escape '!'"""
            """ and name NOT LIKE '%!_stat' escape '!'"""
            """;""").fetchall()

        mappings = {}
        columnnames = {}
        for table, create_string in tables:
            # SQL keywords are case-insensitive, so "USING fts4" must be
            # recognised as well; otherwise the virtual table would be merged
            # in addition to its *_content table and rows would be duplicated.
            if create_string and re.match(
                    r"^CREATE VIRTUAL TABLE ([^\s]+) using fts",
                    create_string, re.IGNORECASE):
                continue

            # Quote the identifier so unusual table names cannot break the
            # statement; column 1 of each table_info row is the column name.
            quoted = '"{}"'.format(table.replace('"', '""'))
            cnames = [
                row[1]
                for row in con.execute(
                    "pragma table_info({});".format(quoted)).fetchall()
            ]

            if table.endswith("_content"):
                mappings[table] = table[:-len("_content")]
                # Skip the leading column (docid in the FTS content table).
                columnnames[table] = cnames[1:]
            else:
                mappings[table] = table
                columnnames[table] = cnames
        return (mappings, columnnames)
    finally:
        # The sqlite3 context manager only commits/rolls back; close
        # explicitly so the file handle is released.
        con.close()
def merge_schema(con, dbpath):
    """Recreate the table definitions of ``dbpath`` in the main database.

    The source file is attached as ``schemadb`` and every ``CREATE``
    statement of type ``table`` is printed and executed on ``con``, except
    for tables whose names end in ``_segments``, ``_segdir``, ``_docsize``,
    ``_stat`` or ``_content``.

    Args:
        con: Open sqlite3 connection to the destination database.
        dbpath: Path of the sqlite file whose schema is copied.

    Raises:
        sqlite3.Error: If attaching fails or a CREATE statement cannot be
            executed (for example because the table already exists).
    """
    con.execute("ATTACH DATABASE ? as schemadb;", (dbpath, ))
    try:
        # Materialise the rows up front so no read cursor on schemadb is
        # still open while the DDL runs or when the database is detached.
        create_strings = con.execute(
            """select sql from schemadb.sqlite_master where type='table'"""
            """ and name NOT LIKE '%!_segments' escape '!'"""
            """ and name NOT LIKE '%!_segdir' escape '!'"""
            """ and name NOT LIKE '%!_docsize' escape '!'"""
            """ and name NOT LIKE '%!_stat' escape '!'"""
            """ and name NOT LIKE '%!_content' escape '!'"""
            """;"""
        ).fetchall()

        for (create_string, ) in create_strings:
            print(create_string)
            con.execute(create_string)
    finally:
        # Always detach, so a failed CREATE does not leave schemadb attached
        # to the caller's connection.
        con.execute("DETACH DATABASE schemadb;")
def find(pattern, path):
    """Lazily yield the paths of all files below ``path`` matching ``pattern``.

    Args:
        pattern: Shell-style wildcard (as understood by ``fnmatch``) that is
            tested against each file's base name.
        path: Directory that is walked recursively.

    Yields:
        The joined directory and file name of every matching file.
    """
    for directory, _subdirs, filenames in os.walk(path):
        yield from (
            os.path.join(directory, hit)
            for hit in fnmatch.filter(filenames, pattern)
        )
def help():
    """Print the command-line usage summary to standard output."""
    usage_lines = (
        "Usage: merge_dbs DBSPATH TODB",
        " DBSPATH the folder containing the *.sqlite files",
        " (searched recursivly)",
        " TODB the destination sqlite file",
    )
    print("\n".join(usage_lines))
def main(argv):
    """Merge one sqlite file, or every ``*.sqlite`` below a folder, into TODB.

    The first source database provides the table mappings and, when the
    destination is still empty, the schema. Sources are attached in batches
    of at most ``MAX_ATTACHED_DBS`` and merged batch by batch.

    Args:
        argv: Exactly two positional arguments: the source path (a folder
            that is searched recursively, or a single sqlite file) and the
            destination sqlite file.

    Raises:
        SystemExit: With status 1 if the argument count is wrong or no
            source database could be found.
    """
    if len(argv) != 2:
        help()
        sys.exit(1)

    dbspath, todb = argv
    print("Using sqlite3 version {}".format(sqlite3.sqlite_version))

    if os.path.isdir(dbspath):
        sqlitepaths = list(find("*.sqlite", dbspath))
    else:
        sqlitepaths = [dbspath]

    # An empty folder previously crashed with an IndexError on
    # sqlitepaths[0]; fail with an explicit message instead.
    if not sqlitepaths:
        print("No *.sqlite files found in {}".format(dbspath),
              file=sys.stderr)
        sys.exit(1)

    firstdb = sqlitepaths[0]
    mappings, columnnames = get_mappings_and_column_names(firstdb)
    print("Mappings:")
    print(json.dumps(mappings, indent=4))
    print("Column names:")
    print(json.dumps(columnnames, indent=4))

    con = sqlite3.connect(todb)
    try:
        # "with con" commits on success and rolls back on error, but does not
        # close the connection -- hence the surrounding try/finally.
        with con:
            # Only an empty destination receives the schema.
            if con.execute(
                    "SELECT COUNT(*) FROM sqlite_master;").fetchone()[0] == 0:
                print("Merging schema from {}".format(firstdb))
                merge_schema(con, firstdb)

            currently_attached = []
            last_index = len(sqlitepaths) - 1
            for i, dbpath in enumerate(sqlitepaths):
                dbname = "db{}".format(i)
                print("Attaching {}".format(dbpath))
                con.execute("ATTACH DATABASE ? as ?", (dbpath, dbname))
                currently_attached.append(dbname)

                # Flush when the batch is full or the last source was attached.
                if (len(currently_attached) % MAX_ATTACHED_DBS == 0
                        or i == last_index):
                    merge_and_detach(con, currently_attached, mappings,
                                     columnnames)
                    currently_attached = []
            con.commit()
    finally:
        con.close()
# Run the merger only when executed as a script; the program name is dropped
# so main() receives just the positional arguments.
if __name__ == "__main__":
    main(sys.argv[1:])

View File

@ -16,4 +16,4 @@ override_vars
set -u
find $DBDIR -name "*.sqlite" -exec "$EXTENSIONCRAWLERDIR/scripts/merge_dbs.sh" "{}" "$OUTDBPATH" \;
"$EXTENSIONCRAWLERDIR/scripts/merge_dbs" "$DBDIR" "$OUTDBPATH"

View File

@ -1,36 +0,0 @@
#!/bin/bash
# Merge a single sqlite database into a destination database.
# Usage: merge_dbs.sh FROM_DB TO_DB
#   FROM_DB  existing source sqlite file
#   TO_DB    destination sqlite file (created from FROM_DB's schema if missing)
FROM_DB=$1
TO_DB=$2
# The source must be given and must be an existing regular file.
if [ -z $FROM_DB ] || ! [ -f $FROM_DB ]; then
echo "source db not provided or does not exist"
exit 1
fi
# The destination path must be given; the file itself may not exist yet.
if [ -z $TO_DB ]; then
echo "destination db not provided"
exit 1
fi
# First use of the destination: replay the source schema, leaving out the
# CREATE TABLE statements of the *_content, *_docsize, *_segments, *_stat and
# *_segdir tables (presumably the FTS shadow tables, which the virtual table
# creates on its own -- TODO confirm).
if ! [ -f $TO_DB ]; then
echo "Creating $TO_DB ..."
sqlite3 $FROM_DB .schema | grep -Eiv \
-e "^CREATE TABLE IF NOT EXISTS (\"|')?[a-z]+_content(\"|')?\(" \
-e "^CREATE TABLE IF NOT EXISTS (\"|')?[a-z]+_docsize(\"|')?\(" \
-e "^CREATE TABLE IF NOT EXISTS (\"|')?[a-z]+_segments(\"|')?\(" \
-e "^CREATE TABLE IF NOT EXISTS (\"|')?[a-z]+_stat(\"|')?\(" \
-e "^CREATE TABLE IF NOT EXISTS (\"|')?[a-z]+_segdir(\"|')?\(" \
| sqlite3 $TO_DB
fi
echo "Merging $FROM_DB into $TO_DB..."
# Replay the source dump into the destination: drop all CREATE TABLE lines and
# the inserts into sqlite_master, *_segments, *_segdir, *_docsize and *_stat,
# and rewrite "INSERT INTO <x>_content VALUES(<number>," into
# "INSERT INTO <x> VALUES(" so the leading numeric value is discarded and the
# row is inserted into <x> itself.
sqlite3 $FROM_DB .dump | grep -Eiv \
-e "^CREATE TABLE" \
-e "^INSERT INTO (\"|')?sqlite_master(\"|')?" \
-e "^INSERT INTO (\"|')?[a-z]+_segments(\"|')? " \
-e "^INSERT INTO (\"|')?[a-z]+_segdir(\"|')? " \
-e "^INSERT INTO (\"|')?[a-z]+_docsize(\"|')? " \
-e "^INSERT INTO (\"|')?[a-z]+_stat(\"|')? " \
| sed -r "s/^INSERT INTO ([a-z]+)_content VALUES\([[:digit:]]+,/INSERT INTO \1 VALUES(/I" \
| sqlite3 $TO_DB

View File

@ -19,5 +19,5 @@ override_vars
find "$DBDIR" -name "$(task_id_to_letter_256 $SGE_TASK_ID)*.sqlite" -print0 | while IFS= read -r -d '' file; do
DBNAME=$(basename "$file")
"$EXTENSIONCRAWLERDIR/scripts/merge_dbs.sh" "$file" "$OUTDBPATH/${DBNAME:0:3}/$DBNAME"
"$EXTENSIONCRAWLERDIR/scripts/merge_dbs" "$file" "$OUTDBPATH/${DBNAME:0:3}/$DBNAME"
done