Hi everybody,

I've got a code which returns to a given text an inverse index. From a list of tokens, the function produces a list sorted by the frequency.

Example:
inverted_index(['the', 'house', ',', 'the', 'beer'])
[('the', [0, 3]), ('beer', [4]), ('house', [1]), (',', [2])]

Code:

def outp(txt):
    ind = {}
    for word in txt:
        if word not in ind.keys():
            i = txt.index(word)
            ind[word] = [i]
        else:
            i = txt.index(word, ind[word][-1]+1)
            ind[word].append(i)
        sorted_ind = sorted(ind.items(), key=lsort, reverse=True)
    return sorted_ind




def lsort(kv):
    return len(kv[1])

The code works, but it's very slow.
So, my question is: How could it be written s.t. the code is faster?

Thanks for any propositions, Darek

Edited 4 Years Ago by Darek6

If you need to keep indexes of the words, itertools.groupby might be more useful than Counter.

This might be faster:

# create and word:index dictionary and a sorted list of index tuples

def create_index_dict(data_list):
    index_dict = {}
    for ix, word in enumerate(data_list):
        index_dict.setdefault(word, []).append(ix)
    return index_dict


data_list = ['the', 'house', ',', 'the', 'beer']
index_dict = create_index_dict(data_list)
print(index_dict)

'''
{'house': [1], 'the': [0, 3], 'beer': [4], ',': [2]}
'''

index_list = [(k, v) for k, v in index_dict.items()]
print(sorted(index_list, reverse=True))

'''
[('the', [0, 3]), ('house', [1]), ('beer', [4]), (',', [2])]
'''
Comments
fast code

I tried to do it using itertools.groupby(), but hihe's method is faster. The reason is probably that groupby() needs an initial sort, while filling a dict doesn't. Here is the comparison

#!/usr/bin/env python
# -*-coding: utf8-*-
# compare two implementations of creating a sorted index list

data_list = ['the', 'house', ',', 'the', 'beer']

from itertools import count, groupby
from operator import itemgetter
ig0 = itemgetter(0)
ig1 = itemgetter(1)

def score(item):
    return (-len(item[1]), item[0])

def grib_func(data_seq):
    L = sorted(zip(data_seq, count(0)))
    L = ((key, [x[1] for x in group]) for key, group in groupby(L, key = ig0))
    return sorted(L, key = score)

# hihe's code    
def create_index_dict(data_list):
    index_dict = {}
    for ix, word in enumerate(data_list):
        index_dict.setdefault(word, []).append(ix)
    return index_dict

def hihe_func(data_seq):
    return sorted(create_index_dict(data_seq).items(), key = score)

# comparison code

print grib_func(data_list)
print hihe_func(data_list)

from timeit import Timer
for name in ("hihe_func", "grib_func"):
    tm = Timer("%s(data_list)"%name, "from __main__ import %s, data_list"%name)
    print "{0}: {1}".format(name, tm.timeit())

""" my output -->
[('the', [0, 3]), (',', [2]), ('beer', [4]), ('house', [1])]
[('the', [0, 3]), (',', [2]), ('beer', [4]), ('house', [1])]
hihe_func: 11.2509949207
grib_func: 18.3761279583
"""

Edited 4 Years Ago by Gribouillis

Comments
thanks fortiming this
thanks
This question has already been answered. Start a new discussion instead.