import_process
Import Process
This is generic overview of the import process from an HTML source into a database. I won't go into details or specifics because I don't want to support the code, plus it's hacked together anyway.
Here's pretty much how it operates though:
- Fetch all HTML sources
- Convert HTML sources to DOM (google-chrome-stable –headless –dump-dom http://scriptures.nephi.org)
- Insert a custom JavaScript source file using JQuery to remove HTML tags and get scripture text
- Iterate over the modified HTML files
- Custom JavaScript makes an AJAX call to a script which sanitizes data and imports it into the database
That's it!
Doing some undocumented code dumps here:
<script src='jquery-3.3.1.min.js'></script><script type='text/javascript'>var v=1;var scripture=new Object();for(v=1;v<$('p.verse').length+1;v+=1){$('#p'+v+' span.verse-number.verse').remove();$('#p'+v+' span.para-mark').remove();$('#p'+v+' sup.studyNoteMarker.dontHighlight').remove();scripture.bcv=$('title').text()+':'+v;scripture.text=$('#p'+v).text();$.ajax({url:'http://localhost:8080/',type:'POST',async:false,data:scripture})}</script>
AJAX:
<?php
error_reporting(E_ALL & ~E_STRICT & ~E_DEPRECATED);
ini_set('error_log', 'error.log');
$insert = false;
$update = true;
if($insert)
$pdo_dsn = "sqlite:import.db";
elseif($update)
$pdo_dsn = "sqlite:update.db";
else
exit;
$legacy_db = new PDO("sqlite:lds-scriptures-sqlite3.db");
$legacy_db->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_ASSOC);
$update_db = new PDO("sqlite:update.db");
$update_db->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_ASSOC);
// Sample queries
// getOne():
/*
$foo = $db->query('SELECT id FROM dvds ORDER BY id DESC LIMIT 1;');
$bar = $foo->fetchColumn();
var_dump($bar);
// getCol():
$foo = $db->query('SELECT id FROM dvds ORDER BY id DESC LIMIT 10;');
$bar = $foo->fetchAll(PDO::FETCH_COLUMN);
// getAll():
$foo = $db->query('SELECT * FROM dvds ORDER BY id DESC LIMIT 10;');
$bar = $foo->fetchAll();
print_r($bar);
// getRow():
$foo = $db->query('SELECT * FROM dvds ORDER BY id DESC LIMIT 10;');
$bar = current($foo->fetchAll());
print_r($bar);
*/
try {
$db = new PDO($pdo_dsn);
$db->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_ASSOC);
} catch (PDOException $e) {
echo "* Error!: ".$e->getMessage()."\n";
exit(1);
}
extract($_POST);
$bcv = htmlentities($bcv);
$bcv = str_replace(" ", " ", $bcv);
$bcv = str_replace("—", "--", $bcv); // "Joseph Smith--History"
$text = htmlentities($text);
$text = str_replace(" ", " ", $text);
$text = str_replace("‘", "'", $text);
$text = str_replace("’", "'", $text);
$text = str_replace("“", "\"", $text);
$text = str_replace("”", "\"", $text);
$text = str_replace("–", "-", $text);
$text = str_replace("—", "--", $text);
$text = str_replace("…", "...", $text); // Joseph Smith--History # 41910
$text = str_replace("æ", "ae", $text);
$text = str_replace("Æ", "Ae", $text);
// Legacy has 'Psalms 1:1' but new has 'Psalm 1:1' -- use the old one when doing a lookup
$bcv = str_replace("Psalm ", "Psalms ", $bcv);
// "Solomon's Song" changed to "Song of Solomon"
$bcv = str_replace("Song of Solomon", "Solomon's Song", $bcv);
if($bcv == "Joseph Smith--History 1:71")
$text = str_replace("commanded.*", "commanded.", $text); // Joseph Smith--History 1:71
if($update) {
$q_bcv = $db->quote($bcv);
$q_scripture_text = $db->quote($text);
$sql = "SELECT verse_id, scripture_text FROM scriptures WHERE verse_title = $q_bcv;";
$rs = $legacy_db->query($sql);
$rows = $rs->fetchAll();
if(count($rows) == 0) {
error_log("Cannot find verse ID for $bcv");
exit;
}
$row = current($rows);
$legacy_id = $row['verse_id'];
$legacy_text = $row['scripture_text'];
if($legacy_text != $text) {
$sql = "UPDATE verses SET scripture_text = $q_scripture_text WHERE id = $legacy_id;";
file_put_contents("update_sqlite.sql", "$sql\n", FILE_APPEND);
// $update_db->query($sql);
error_log($bcv);
}
}
import_process.txt · Last modified: by steve
