hi,

so i've been looking for a string matching algorithm that gives me a "rating" based on how close the 2 strings match.. luckily, i googled and read this article: how to strike a match

here's the original java class code

package strike_a_match;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;

public class LetterPairSimilarity {
	
	/** @return lexical similarity value in the range [0,1] */
	public static double compareStrings(String str1, String str2) {
		ArrayList <String> pairs1 = wordLetterPairs(str1.toUpperCase());
		ArrayList <String> pairs2 = wordLetterPairs(str2.toUpperCase());
		int intersection = 0;
		int union = pairs1.size() + pairs2.size();
		for (int i=0; i<pairs1.size(); i++) {
			Object pair1=pairs1.get(i);
			for(int j=0; j<pairs2.size(); j++) {
				Object pair2=pairs2.get(j);
				if (pair1.equals(pair2)) {
					intersection++;
					pairs2.remove(j);
					break;
				}
			}
		}
		return (2.0*intersection)/union;
	}
	/** @return an ArrayList of 2-character Strings. */
	private static ArrayList <String> wordLetterPairs(String str) {
		ArrayList <String> allPairs = new ArrayList<String>();
		// Tokenize the string and put the tokens/words into an array
		String[] words = str.split(" ");
		// For each word
		for (int w=0; w < words.length; w++) {
			// Find the pairs of characters
			String[] pairsInWord = letterPairs(words[w]);
			for (int p=0; p < pairsInWord.length; p++) {
				allPairs.add(pairsInWord[p]);
			}
		}
		return allPairs;
	}
	/** @return an array of adjacent letter pairs contained in the input string */
	private static String[] letterPairs(String str) {
		int numPairs = str.length()-1;
		String[] pairs = new String[numPairs];
		for (int i=0; i<numPairs; i++) {
			pairs[i] = str.substring(i,i+2);
		}
		return pairs;
	}

}

now, here is my code as i tried to convert it to php

<?php
class LetterPairSimilarity{

	function compareStrings($str1, $str2){
		$pairs1 = array(); //ArrayList <String>
		$pairs2 = array(); //ArrayList <String>
		$pairs1 = $this->wordLetterPairs(strtoupper($str1));
		$pairs2 = $this->wordLetterPairs(strtoupper($str2));
		$intersection = 0; //int  
		$union = count($pairs1) + count($pairs2); //int ->sizes of arrays
		for ($i=0; $i<count($pairs1); $i++) {
			$pair1 = $pairs1[$i];
			for($j=0; $j<count($pairs2); $j++) {
				$pair2 = $pairs2[$j];
				if (strcasecmp($pair1,$pair2) == 0) {
					$intersection++;
					unset($pairs2[$j]); //pairs2.remove(j)
					break;
				}
			}
		}
		echo "RATING: ".(2*$intersection)/$union;
		return (2*$intersection)/$union;
	}
	
	function wordLetterPairs($str){
		$allPairs = array (); //ArrayList <String>
		$words = array ();
		// Tokenize the string and put the tokens/words into an array
		$words = explode(" ", $str); //String[] words = $str.split("\\s")
		// For each word
		for ($w=0; $w < count($words); $w++) {
			// Find the pairs of characters
			$pairsInWord = array ();
			$pairsInWord = $this->letterPairs($words[$w]); // String[] pairsInWord = letterPairs(words[w])
			for ($p=0; $p < count($pairsInWord); $p++) { //pairsInWord.length
				array_push($allPairs, $pairsInWord[$p]); //allPairs.add(pairsInWord[p]);
			}
		}
		return $allPairs;
	}
	
	function letterPairs($str){
		$numPairs = strlen($str)-1; //int numPairs = str.length()-1
		$pairs = array (); //new String[$numPairs]
		for ($i=0; $i<$numPairs; $i++) {
			$pairs[$i] = substr($str, $i ,$i+2); //$pairs[$i] = $str.substring(i,i+2);
		}
		return $pairs;
	}
}
?>

----------
my issues:

in the article i have cited, "france" and "french" gives me 0.4, this works well with my php code.. but when i try "france" and "france" (obviously the same string) this should give me 1.0.. sad thing tho, it gives me either a 0.6 or a 0.5...


please.. i DESPERATELY need help

What was the answer?

i traced the code, compared it to the java code,and found out some language barriers.. (lol) anyway, here's the important thing:

<?php
class LetterPairSimilarity {
	
	/** @return lexical similarity value in the range [0,1] */
	function compareStrings($str1, $str2) {
		$pairs1 = $this->wordLetterPairs(strtoupper ($str1));
		$pairs2 = $this->wordLetterPairs(strtoupper ($str2));
		$intersection = 0;
		$union = sizeof($pairs1) + sizeof($pairs2);
		for ($i=0; $i<sizeof($pairs1); $i++) {
			$pair1=$pairs1[$i];
			for($j=0; $j<sizeof($pairs2); $j++) {
				$pair2=$pairs2[$j];
				if ($pair1 === $pair2) {
					$intersection++;
					//unset($pairs2[$j]);
					break;
				}
			}
		}
		echo "RATING: ".(2.0*$intersection)/$union;
		return (2.0*$intersection)/$union;
	}
	/** @return an ArrayList of 2-character Strings. */
	function wordLetterPairs($str) {
		$allPairs = array();
		// Tokenize the string and put the tokens/words into an array
		$words = explode(" ", $str);
		// For each word
		for ($w=0; $w<sizeof($words); $w++) {
			// Find the pairs of characters
			$pairsInWord = array();
			$pairsInWord = $this->letterPairs($words[$w]);
			for ($p=0; $p<sizeof($pairsInWord); $p++) {
				$allPairs[$p] = $pairsInWord[$p];
			}
		}
		return $allPairs;
	}
	/** @return an array of adjacent letter pairs contained in the input string */
	function letterPairs($str) {
		$numPairs = strlen($str)-1;
		$pairs = array();
		for ($i=0; $i<$numPairs; $i++) {
			$pairs[$i] = substr($str, $i, 2);
		}
		return $pairs;
	}

}
?>

hope this helps!

This question has already been answered. Start a new discussion instead.