User Tools

Site Tools


import_process

Import Process

This is generic overview of the import process from an HTML source into a database. I won't go into details or specifics because I don't want to support the code, plus it's hacked together anyway.

Here's pretty much how it operates though:

  • Fetch all HTML sources
  • Convert HTML sources to DOM (google-chrome-stable –headless –dump-dom http://scriptures.nephi.org)
  • Insert a custom JavaScript source file using JQuery to remove HTML tags and get scripture text
  • Iterate over the modified HTML files
  • Custom JavaScript makes an AJAX call to a script which sanitizes data and imports it into the database

That's it!

Doing some undocumented code dumps here:

<script src='jquery-3.3.1.min.js'></script><script type='text/javascript'>var v=1;var scripture=new Object();for(v=1;v<$('p.verse').length+1;v+=1){$('#p'+v+' span.verse-number.verse').remove();$('#p'+v+' span.para-mark').remove();$('#p'+v+' sup.studyNoteMarker.dontHighlight').remove();scripture.bcv=$('title').text()+':'+v;scripture.text=$('#p'+v).text();$.ajax({url:'http://localhost:8080/',type:'POST',async:false,data:scripture})}</script>

AJAX:

<?php

	error_reporting(E_ALL & ~E_STRICT & ~E_DEPRECATED);
	ini_set('error_log', 'error.log');

	$insert = false;
	$update = true;

	if($insert)
		$pdo_dsn = "sqlite:import.db";
	elseif($update)
		$pdo_dsn = "sqlite:update.db";
	else
		exit;

	$legacy_db = new PDO("sqlite:lds-scriptures-sqlite3.db");
	$legacy_db->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_ASSOC);

	$update_db = new PDO("sqlite:update.db");
	$update_db->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_ASSOC);

	// Sample queries

	// getOne():
	/*
	$foo = $db->query('SELECT id FROM dvds ORDER BY id DESC LIMIT 1;');
	$bar = $foo->fetchColumn();
	var_dump($bar);

	// getCol():
	$foo = $db->query('SELECT id FROM dvds ORDER BY id DESC LIMIT 10;');
	$bar = $foo->fetchAll(PDO::FETCH_COLUMN);

	// getAll():
	$foo = $db->query('SELECT * FROM dvds ORDER BY id DESC LIMIT 10;');
	$bar = $foo->fetchAll();
	print_r($bar);

	// getRow():
	$foo = $db->query('SELECT * FROM dvds ORDER BY id DESC LIMIT 10;');
	$bar = current($foo->fetchAll());
	print_r($bar);
	*/

	try {
		$db = new PDO($pdo_dsn);
		$db->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_ASSOC);
	} catch (PDOException $e) {
    		echo "* Error!: ".$e->getMessage()."\n";
		exit(1);
	}

	extract($_POST);

	$bcv = htmlentities($bcv);
	$bcv = str_replace("&nbsp;", " ", $bcv);
	$bcv = str_replace("&mdash;", "--", $bcv); // "Joseph Smith--History"

	$text = htmlentities($text);
	$text = str_replace("&nbsp;", " ", $text);
	$text = str_replace("&lsquo;", "'", $text);
	$text = str_replace("&rsquo;", "'", $text);
	$text = str_replace("&ldquo;", "\"", $text);
	$text = str_replace("&rdquo;", "\"", $text);
	$text = str_replace("&ndash;", "-", $text);
	$text = str_replace("&mdash;", "--", $text);
	$text = str_replace("&hellip;", "...", $text); // Joseph Smith--History # 41910
	$text = str_replace("&aelig;", "ae", $text);
	$text = str_replace("&AElig;", "Ae", $text);

	// Legacy has 'Psalms 1:1' but new has 'Psalm 1:1' -- use the old one when doing a lookup
	$bcv = str_replace("Psalm ", "Psalms ", $bcv);

	// "Solomon's Song" changed to "Song of Solomon"
	$bcv = str_replace("Song of Solomon", "Solomon's Song", $bcv);

	if($bcv == "Joseph Smith--History 1:71")
		$text = str_replace("commanded.*", "commanded.", $text); // Joseph Smith--History 1:71

	if($update) {

		$q_bcv = $db->quote($bcv);
		$q_scripture_text = $db->quote($text);

		$sql = "SELECT verse_id, scripture_text FROM scriptures WHERE verse_title = $q_bcv;";

		$rs = $legacy_db->query($sql);
		$rows = $rs->fetchAll();

		if(count($rows) == 0) {
			error_log("Cannot find verse ID for $bcv");
			exit;
		}

		$row = current($rows);

		$legacy_id = $row['verse_id'];
		$legacy_text = $row['scripture_text'];

		if($legacy_text != $text) {
			$sql = "UPDATE verses SET scripture_text = $q_scripture_text WHERE id = $legacy_id;";
			file_put_contents("update_sqlite.sql", "$sql\n", FILE_APPEND);
			// $update_db->query($sql);
			error_log($bcv);
		}

	}
import_process.txt · Last modified: 2019/03/24 16:13 by steve