Grouping strings by prefixes

Gribouillis 0 Tallied Votes 686 Views Share

This snippet defines a class which can group a collection of strings according to a given set of prefixes. Each string goes in the group of the longest prefix it contains.

#!/usr/bin/env python

# Gribouillis at www.daniweb.com
"""
The class Grouper implements grouping strings by prefixes.

A Grouper object is created with a sequence of prefixes
G = Grouper(["pref1", "pref2", ...])

(an empty prefix is automatically added). The Grouper object
is a dictionary "prefix --> set of strings". Initially, it looks
like
  {
     "pref1" : set([]),
     "pref2" : set([]),
  }

The method 'add_strings' can be used to add a sequence of strings
to the grouper. Each string goes in one of the sets according to
the longest prefix it contains.
After
  G.add_strings(["pref1aaaaa", "pref2jjj", "pref1s"])

it's content will be

  {
    "pref1" : set(["pref1aaaaa", "pref1s"]),
    "pref2" : set(["pref2jjj"]),
  }

More strings can be added.
"""

from collections import deque 
from bisect import bisect_left ,bisect_right 

class Grouper (dict ):
  def __init__ (self ,iterable ):
    prefixes =set (iterable )
    prefixes .add ("")
    dict .__init__ (self ,[(p ,set ())for p in prefixes ])
    self .skeys =[]

  def add_strings (self ,iterable ):
    strings =sorted (set (iterable ))
    if len (self .skeys )!=len (self ):
      self .skeys =sorted (self )[1 :]
    stack =deque ([("",chr (255 ))])
    k =0 
    for p in self .skeys :
      while stack [-1 ][1 ]<=p :
        q ,qs =stack .pop ()
        k =self ._forward (q ,k ,qs ,strings )
      k =self ._forward (stack [-1 ][0 ],k ,p ,strings )
      stack .append ((p ,p [:-1 ]+chr (ord (p [-1 ])+1 )))
    while stack :
      q ,qs =stack .pop ()
      k =self ._forward (q ,k ,qs ,strings )

  def _forward (self ,q ,k ,qs ,strings ):
    ks =bisect_left (strings ,qs ,k )
    self [q ].update (strings [i ]for i in xrange (k ,ks ))
    return ks 

if __name__ =="__main__":
  grouper =Grouper (["/usr","/usr/lib","hello","heat"])
  grouper .add_strings ([
  "/usr/bin/python","/usr/lib/python2.5","/dev",
  "hello world","hello","heat foo","heat","her",
  "/usr/liberation",
  ])
  for k ,s in grouper .items ():
    print ("%s: %s"%(repr (k ),str (s )))
  print "Adding more strings"
  grouper .add_strings ([
  "hello fred","/usrssss",
  ])
  for k ,s in grouper .items ():
    print ("%s: %s"%(repr (k ),str (s )))