I can't edit the post above, so here's a new post.
If you can download numpy, then this appears to be useful. The time for finding a single match among a data set of 60000 came down to 1 second:
import numpy
import time
def findclosest(a,X):
"""finds the vector in X most nearly parallel to a.
This is the computationally expensive area, so optimize here."""
if not len(X):
return None
current = X[0]
smallest = numpy.inner(a-current, a-current)
for item in X:
d = numpy.inner(a-item, a-item)
if d < smallest:
current = item
smallest = d
return current
def findclosesttaxicab(a,X):
"""finds the vector in X most nearly parallel to a using taxicab metric."""
if not len(X):
return None
current = X[0]
smallest = numpy.abs(a-current).sum()
for item in X:
d = numpy.abs(a-item).sum()
if d < smallest:
current = item
smallest = d
return current
def find_first_n(data,X, n=3):
"""
find_first_n(data, X, n=3) --> [(fit,vector, match), ...]
Finds up to first n vectors along with their matches, sorted by
order of fit."""
if len(data) <= n:
return data
else:
# A little ugly, but we can't use dictionaries, since vectors may not
# be unique AND because dictionaries don't have ordering.
# create a list of three winners: [(fit,vector,match),...] ordered
# by fit from smallest to largest.
winners = []
for i in range(n):
match = findclosest(data[i],X)
fit = numpy.inner(data[i],match)
for x in range(len(winners)):
if fit < winners[x][0]:
winners = (winners[:x] + [(fit,data[i],match)] + winners[x:])
break
else:
winners.append((fit,data[i],match))
return winners
def find_best_n(data, X, n=3):
"""
find_best_three(data, X, n=3) --> [(fit,vector, match), ...]
Finds best n vectors along with their matches, sorted by
order of fit."""
if len(data) <= n:
return data
# Prime the pump
winners = find_first_n(data,X,n)
# We've already examined the first n...
for i in range(n,len(data)):
match = findclosest(data[i],X)
fit = numpy.inner(data[i],match)
for x in range(n):
if fit < winners[x][0]:
winners = (winners[:x] + [(fit,data[i],match)] + winners[x:])[:n]
break
return winners
X = numpy.random.rand(60000,37)
data = numpy.random.rand(200,37)
# optimization metrics
start = time.time()
for i in data:
findclosest(i,X)
stop = time.time()
print "findclosest: Dt = %0.2f for %d matches in array of size %d" %\
(stop - start,len(data),len(X))
start = time.time()
for i in data:
findclosesttaxicab(i,X)
stop = time.time()
print "findclosesttaxicab: Dt = %0.2f for %d matches in array of size %d" %\
(stop - start,len(data),len(X))
# END optimization metrics
#
#best= find_best_n(data,X,3)
#print best, len(best)
>>>
findclosest: Dt = 198.63 for 200 matches in array of size 60000
findclosesttaxicab: Dt = 275.77 for 200 matches in array of size 60000 # surprise! I thought it would be faster
>>>