Comparing Text

The code below is used by theholyscriptures.info to compare similar verses. For example, sections of Isaiah are quoted in the Book of Mormon (e.g. 2 Nephi 24:2 Δ Isaiah 14:2), and these verses can be shown side by side thanks to the compare function shown below. The compare function is designed to work on plain text and produce a result with minimal markup: only ins and del tags are added. To compare xml or html text, the < and > characters should first be converted to html entities (e.g. &lt; and &gt;).

The difficult part of this algorithm is determining how many words to “look ahead”. My solution was brute force, checking all possibilities and then choosing the lookahead number that produces the least markup. More information is given in the code comments.

I hope someone finds this useful in other applications. If you can suggest improvements to the code or algorithm, I am very interested!

// This compare routine takes two similar texts and returns
// the difference with <ins> and <del> markup.
String.prototype.compare = function(compareTo) {
  // Author: Jay Mackley - 2009, TheHolyScriptures.info. For public use with attribution.
  function Compare(mySource, myTarget, lookAhead) {
    // First clean up and normalize the text.
    //
    // Trim spaces front and back. Make sure it's all one line per verse.
    mySource = mySource.replace(/^\s*|\s*$|[\n\r]/g,"");
    myTarget = myTarget.replace(/^\s*|\s*$|[\n\r]/g,"");
    // Remove all markup tags
    mySource = mySource.replace(/<(.|\n)*?>/g,"");
    myTarget = myTarget.replace(/<(.|\n)*?>/g,"");
    // Remove extra spaces and treat long dashes as a word separator.
    // Include soft hyphen (00AD), figure dash (2012), em dash (2013), en dash (2014)
    mySource = mySource.replace(/\s+|(--)|[\u00AD\u2012\u2013\u2014]/g," ");
    myTarget = myTarget.replace(/\s+|(--)|[\u00AD\u2012\u2013\u2014]/g," ");
    // Move cleaned text into arrays that still contain word punctuation.
    // These arrays are used to build the final result but not for comparison.
    var cleanSourceArray = mySource.split(" ");
    var cleanTargetArray = myTarget.split(" ");
    //
    // Remove all other punctuation and caps so only plain words are left to compare.
    // Create these arrays for making the actual text comparisons word by word
    var sourceWords = mySource.replace(/[,:?!.;]/g,"").toLowerCase().split(" ");
    var targetWords = myTarget.replace(/[,:?!.;]/g,"").toLowerCase().split(" ");
    //
    var myDelta = "";   // Holds the comparison results text with markup
    var bDone;          // Set to true when all words have been examined
    var aMax;           // Maximum words in source array
    var bMax;           // Maximum words in target array
    var aLookAhead;     // How many source words to look ahead for comparison
    var bLookAhead;     // How many target words to look ahead for comparison
    var aFound;         // Indicates a word match in source
    var bFound;         // Indicates a word match in target
    var a = 0;          // Source array pointer
    var b = 0;          // Target array pointer
    var la;             // Source array pointer
    var lb;             // Target array pointer

    bDone = false;
    while (bDone === false) {
      if (sourceWords[b] === targetWords[a]) {
        myDelta = myDelta+cleanTargetArray[a]+" ";
        b++;
        a++;
      } else {
        if ((b + lookAhead) > cleanSourceArray.length-1) {
          bLookAhead = cleanSourceArray.length-1;
        } else {
          bLookAhead = b + lookAhead;
        }
        if ((a + lookAhead) > cleanTargetArray.length-1) {
          aLookAhead = cleanTargetArray.length-1;
        } else {
          aLookAhead = a + lookAhead;
        }
        bFound = false;
        aFound = false;
        for (lb = b; lb <= bLookAhead; lb++) {
          if (targetWords[a] === sourceWords[lb]) {
            bFound = true;
            break;
          }
        }
        for (la = a; la <= aLookAhead; la++) {
          if (sourceWords[b] === targetWords[la]) {
            aFound = true;
            break;
          }
        }
        if (bFound && aFound) {
          if ((lb - b) > (la - a)) {
            for (i = a; i < la; i++) {
              myDelta = myDelta+"<ins>"+cleanTargetArray[i]+"</ins> ";
            }
            a = la;
          } else {
            for (i = b; i < lb; i++) {
              myDelta = myDelta+"<del>"+cleanSourceArray[i]+"</del> ";
            }
            b = lb;
          }
        } else if (bFound === true) {
          for (i = b; i < lb; i++) {
            myDelta = myDelta+"<del>"+cleanSourceArray[i]+"</del> ";
          }
          b = lb;
        } else if (aFound === true) {
          for (i = a; i < la; i++) {
            myDelta = myDelta+"<ins>"+cleanTargetArray[i]+"</ins> ";
          }
          a = la;
        } else {
          if ((b >= cleanSourceArray.length-1) && (a < cleanTargetArray.length-1)) {
            myDelta = myDelta+"<ins>"+cleanTargetArray[a]+"</ins> ";
            b = cleanSourceArray.length-1;
            a++;
          } else {
            if ((a >= cleanTargetArray.length-1) && (b < cleanSourceArray.length-1)) {
              myDelta = myDelta+"<del>"+cleanSourceArray[b]+"</del> ";
              a = cleanTargetArray.length-1;
              b++;
            } else {
              myDelta = myDelta+"<ins>"+cleanTargetArray[a]+"</ins> ";
              myDelta = myDelta+"<del>"+cleanSourceArray[b]+"</del> ";
              a++;
              b++;
            }
          }
        }
      }
      if ((b > cleanSourceArray.length-1) || (a > cleanTargetArray.length-1)) bDone = true;
    }
    if (cleanSourceArray.length < cleanTargetArray.length) {
      // The target text has words remaining, so add them on the end as insertions
      for (i = a; i < cleanTargetArray.length; i++) {
        myDelta += "<ins>"+cleanTargetArray[i]+"</ins> ";
      }
    } else if (cleanTargetArray.length < cleanSourceArray.length) {
      // The source text has words remaining, so add them on the end as deletions.
      for (i = b; i < cleanSourceArray.length; i++) {
        myDelta += "<del>"+cleanSourceArray[i]+"</del> ";
      }
    }
    myDelta = RemoveRedundantTags(myDelta)
    return myDelta;
  } // end of Compare function
  //
  function RemoveRedundantTags(deltaMarkup) {
    var state = "";
    var prevState = "";
    var words = deltaMarkup.split(" ");
    for (var p = 1; p < words.length; p++) {
      prevState = state;
      if (words[p].left(4) === "<ins") {
          state = "ins";
      } else if (words[p].left(4) === "<del") {
        state = "del";
      } else {
        state = "";
      }
      if ((state === "ins" && prevState === "ins") || (state === "del" && prevState === "del")) {
        words[p-1] = words[p-1].substr(0,words[p-1].length-6);
        words[p] = words[p].substr(5,words[p].length-5);
      }
    }
    return words.join(" ");
  }
  var sourceMax = this.count(" ")+1;
  var targetMax = compareTo.count(" ")+1;
  var max;
  if (targetMax > sourceMax) {
    max = targetMax;
  } else {
    max = sourceMax;
  }
  var bestCnt = max;
  var currCnt;
  var bestLookAhead = 1;
  // Examine all possibilities to determine the best lookahead value
  for (var testLookAhead = 1; testLookAhead <= max; testLookAhead++) {
    currCnt = Compare(this, compareTo, testLookAhead).count("</");
    if (currCnt < bestCnt) {
      bestCnt = currCnt;
      bestLookAhead = testLookAhead;
    }
  }
  return Compare(this, compareTo, bestLookAhead);
}
// Include the left/count functions in this listing for completeness
String.prototype.count=function(myString) {
  var regexp = new RegExp(myString, "g");
  return this.replace(regexp, myString+"*").length - this.length;
}
String.prototype.left = function(p) {
  if (this.length < p) {
    return this;
  } else {
    return this.substr(0,p);
  }
}

Leave a Reply

Your email address will not be published. Required fields are marked *