Hi everybody,

I've got a code which returns to a given text an inverse index. From a list of tokens, the function produces a list sorted by the frequency.

Example:
inverted_index(['the', 'house', ',', 'the', 'beer'])
[('the', [0, 3]), ('beer', [4]), ('house', [1]), (',', [2])]

Code:

def outp(txt):
    ind = {}
    for word in txt:
        if word not in ind.keys():
            i = txt.index(word)
            ind[word] = [i]
        else:
            i = txt.index(word, ind[word][-1]+1)
            ind[word].append(i)
        sorted_ind = sorted(ind.items(), key=lsort, reverse=True)
    return sorted_ind




def lsort(kv):
    return len(kv[1])

The code works, but it's very slow.
So, my question is: How could it be written s.t. the code is faster?

Thanks for any propositions, Darek

You are doing linear scan of all txt when you use index. Read up on dictionaries in Python.

If you need to keep indexes of the words, itertools.groupby might be more useful than Counter.

You can speed things up by taking line 10 out of the for loop.

This might be faster:

# create and word:index dictionary and a sorted list of index tuples

def create_index_dict(data_list):
    index_dict = {}
    for ix, word in enumerate(data_list):
        index_dict.setdefault(word, []).append(ix)
    return index_dict


data_list = ['the', 'house', ',', 'the', 'beer']
index_dict = create_index_dict(data_list)
print(index_dict)

'''
{'house': [1], 'the': [0, 3], 'beer': [4], ',': [2]}
'''

index_list = [(k, v) for k, v in index_dict.items()]
print(sorted(index_list, reverse=True))

'''
[('the', [0, 3]), ('house', [1]), ('beer', [4]), (',', [2])]
'''
commented: fast code +13

I tried to do it using itertools.groupby(), but hihe's method is faster. The reason is probably that groupby() needs an initial sort, while filling a dict doesn't. Here is the comparison

#!/usr/bin/env python
# -*-coding: utf8-*-
# compare two implementations of creating a sorted index list

data_list = ['the', 'house', ',', 'the', 'beer']

from itertools import count, groupby
from operator import itemgetter
ig0 = itemgetter(0)
ig1 = itemgetter(1)

def score(item):
    return (-len(item[1]), item[0])

def grib_func(data_seq):
    L = sorted(zip(data_seq, count(0)))
    L = ((key, [x[1] for x in group]) for key, group in groupby(L, key = ig0))
    return sorted(L, key = score)

# hihe's code    
def create_index_dict(data_list):
    index_dict = {}
    for ix, word in enumerate(data_list):
        index_dict.setdefault(word, []).append(ix)
    return index_dict

def hihe_func(data_seq):
    return sorted(create_index_dict(data_seq).items(), key = score)

# comparison code

print grib_func(data_list)
print hihe_func(data_list)

from timeit import Timer
for name in ("hihe_func", "grib_func"):
    tm = Timer("%s(data_list)"%name, "from __main__ import %s, data_list"%name)
    print "{0}: {1}".format(name, tm.timeit())

""" my output -->
[('the', [0, 3]), (',', [2]), ('beer', [4]), ('house', [1])]
[('the', [0, 3]), (',', [2]), ('beer', [4]), ('house', [1])]
hihe_func: 11.2509949207
grib_func: 18.3761279583
"""
commented: thanks fortiming this +14
commented: thanks +6