Building argv from command line

Gribouillis 1 Tallied Votes 853 Views Share

Some time ago, I was writing a small command line interpreter, with the help of the standard module cmd which offers minimal support for such tasks, and I had the problem that this module doesn't have a function to parse a command line to produce a list argv which can be passed to optparse for example.
I found a first solution with the third party module pygments which contains parsers for different languages, and I parsed the command line using pygment's bash parser.
However, I was not completely happy with this solution and I started looking for the C function which builds argv for C programs. I finally found such a function in the GNU libiberty library, which is used by the gcc compiler.
This function was simple enough and I decided to write a pure python implementation of this function, using python's regular expressions and following closely the syntax rules used in libiberty's buildargv function (a 100% compatibility is not at all guaranteed)
This snippet is the result of this coding. The class Argv, which subclasses list, contains methods to transform a command line into a list of arguments with a behaviour similar to that of a C compiler. It also contains methods to write the argument list to a response file and read files to extract command arguments.

Here is the code, enjoy :)

# argv.py
"""
    This module implements a class Argv which parses a command line
    to produce a list of arguments similar to the arguments passed
    to a program called from a command shell.

    Note that this class doesn't handle arguments expansion made by
    a specific shell (for example bash replaces $HOME by the path
    to your home directory before passing the command line to the
    program).

    An effort was made to follow as closely as possible the algorithm
    used in GNU libiberty's buildargv function, which is used by the
    gnu C compiler for example.

    Other functions are provided as methods whose behaviour imitates
    the corresponding functions of libiberty.

    * Written by Gribouillis for the python forum at Daniweb.com.
    Use this code freely, copy it, modify it, redistribute it.
"""

__all__ = [
    "Argv", "buildargv", "writeargv",
    "expandargv", "dupargv", "freeargv"
]

import re
import string

def end_of_string(eos):
    """Return a regex fragment matching end of input as the named group ``eos``."""
    template = "(?P<{0}>$)"
    return template.format(eos)

def one_or_more(pat):
    """Wrap ``pat`` in a non-capturing group repeated one or more times."""
    template = r"(?:{0})+"
    return template.format(pat)

def zero_or_more(pat):
    """Wrap ``pat`` in a non-capturing group repeated zero or more times."""
    template = r"(?:{0})*"
    return template.format(pat)

def one_of(*pats):
    """Return a non-capturing group matching any one of the given patterns."""
    alternatives = "|".join(pats)
    return "(?:{0})".format(alternatives)

def all_of(*pats):
    """Return the given patterns concatenated, so they must match in sequence."""
    separator = ""
    return separator.join(pats)

# Regex fragments shared by the tokenizer patterns.

# A backslash followed by any single character.
escaped_char = r"\\."
# A backslash optionally followed by one character; also matches a lone
# backslash, so it can be used to strip escapes from finished arguments.
escaped_opt = r"\\.?"
# string.whitespace made safe for literal use inside a character class.
# re.escape is used instead of repr() so that no stray quote characters
# (the ones surrounding the repr text) end up inside the classes.
_whitespace_chars = re.escape(string.whitespace)
# Any character that is neither whitespace nor a quote.
non_space_or_quote = r"[^{0}\'\"]".format(_whitespace_chars)
# One whitespace character, captured as "eoarg" (end of argument).
one_space_eoarg = r"(?P<eoarg>[{0}])".format(_whitespace_chars)
s_quote = r"[\']"
d_quote = r'[\"]'
# Characters that need a backslash in front of them when written out:
# whitespace, both quote characters and the backslash itself.
special_re = re.compile(r"[{0}\'\"\\]".format(_whitespace_chars))

def non_quote(quote):
    """Return ``quote`` with its first character replaced by ``[^``.

    Given a bracketed class such as ``[\']`` this yields the negated
    class ``[^\']``.
    """
    remainder = quote[1:]
    return "[^{0}".format(remainder)

def quoted(quote, eos):
    """Return a regex for a string delimited by ``quote``.

    The pattern is: an opening quote, any run of escaped characters or
    non-quote characters, then either a closing quote or the end of the
    input. In the latter case the named group ``eos`` takes part in the
    match, which marks the quote as unterminated.
    """
    opening = quote
    body = zero_or_more(one_of(escaped_char, non_quote(quote)))
    closing = one_of(end_of_string(eos), quote)
    return all_of(opening, body, closing)

# One token of a command line, tried in this order: a run of unquoted
# (possibly escaped) characters, a single-quoted string, a double-quoted
# string, or one whitespace character separating two arguments.
item_regex = one_of(
    one_or_more(one_of(escaped_char, non_space_or_quote)),
    quoted(s_quote, "eoss"),
    quoted(d_quote, "eosd"),
    one_space_eoarg,
)

# DOTALL makes "." in the escape patterns match a newline as well, so a
# backslash-newline pair is one escaped character instead of a stray
# backslash followed by an argument separator.
item_re = re.compile(item_regex, re.DOTALL)
escaped_re = re.compile(escaped_opt, re.DOTALL)

class Argv(list):
    """List of command-line arguments with libiberty-style helpers.

    Use :meth:`build` to create an instance from a raw command line.
    The other methods write the arguments to a file, expand ``@file``
    response-file arguments, copy the list and empty it.
    """

    def __init__(self, *args):
        list.__init__(self, *args)

    def _repl_func(self, mo):
        """Record one ``item_re`` match while :meth:`build` scans a line.

        During parsing the last element of ``self`` is a list holding the
        text pieces of the argument currently under construction.
        """
        if mo.group("eoarg") is not None:
            # Whitespace separator: freeze the current argument into a
            # string and open a fresh piece list for the next one.
            self[-1] = ''.join(self[-1])
            self.append([])
            return
        s = mo.group(0)
        if s[0] == "'":
            # Drop the opening quote, and the closing one too unless the
            # quote was left open and ran to the end of the input.
            self[-1].append(s[1: -1 if mo.group("eoss") is None else len(s)])
        elif s[0] == '"':
            self[-1].append(s[1: -1 if mo.group("eosd") is None else len(s)])
        else:
            self[-1].append(s)

    @classmethod
    def build(cls, command_line):
        """Argv.build(command_line) --> a new Argv object containing
        arguments extracted from the command line.

        Quotes group text into a single argument. Once the line is split,
        every backslash is removed and the character following it is
        kept. Arguments that end up empty are discarded.
        """
        self = cls([[]])
        # Each match is either a piece of the current argument or a
        # separator; _repl_func files it accordingly.
        for mo in item_re.finditer(command_line):
            self._repl_func(mo)
        # The final argument has no trailing separator to close it.
        self[-1] = ''.join(self[-1])
        unescaped = [escaped_re.sub(lambda x: x.group(0)[1:], s) for s in self]
        self[:] = [s for s in unescaped if s]
        return self

    def write(self, fileobj):
        """Write the argv to a file object, with one argument per line.

        Whitespace, quote and backslash characters inside an argument
        are each preceded by a backslash.
        """
        for arg in self:
            s = special_re.sub(lambda m: "\\" + m.group(0), arg)
            fileobj.write(s)
            fileobj.write("\n")

    def expand(self):
        """Return a new object of the same class with ``@file`` arguments
        expanded.

        Each argument starting with ``@`` is taken as the path of a
        'response' file. The file's content is parsed with
        :meth:`build`, expanded recursively, and inserted in place of
        the argument. An argument whose file cannot be opened or read is
        kept unchanged. ``self`` is not modified.

        NOTE: there is no guard against a response file that refers to
        itself; such input recurses until Python's recursion limit.
        """
        result = self.__class__()
        for arg in self:
            if not arg.startswith("@"):
                result.append(arg)
                continue
            try:
                with open(arg[1:]) as response_file:
                    text = response_file.read()
            except (IOError, OSError):
                # Treat an unreadable response file literally instead of
                # silently losing the argument.
                result.append(arg)
                continue
            result.extend(Argv.build(text).expand())
        return result

    def free(self):
        """Empties the Argv object"""
        self[:] = []

    def dup(self):
        """Return a shallow copy of this object, of the same class."""
        return self.__class__(self)

# a few wrapper functions to provide an interface similar to
# gnu libiberty's argv interface

def buildargv(line):
    """Parse ``line`` into an Argv; function-style spelling of Argv.build."""
    parsed = Argv.build(line)
    return parsed

def writeargv(argv, fileobj):
    """Write ``argv`` to ``fileobj`` by delegating to ``argv.write``."""
    emit = argv.write
    emit(fileobj)

def expandargv(argv):
    """Return the result of ``argv.expand()``."""
    expanded = argv.expand()
    return expanded

def dupargv(argv):
    """Return the result of ``argv.dup()``."""
    duplicate = argv.dup()
    return duplicate

def freeargv(argv):
    """Call ``argv.free()``; nothing is returned."""
    release = argv.free
    release()


# A simple test function
# we use the same test command lines as argv.c in gnu libiberty.

def tests():
    """Print each sample command line, the Argv parsed from it, and a
    blank separator line."""
    lines = [
        "a simple command line",
        "arg 'foo' is single quoted",
        "arg \"bar\" is double quoted",
        "arg \"foo bar\" has embedded whitespace",
        "arg 'Jack said \\'hi\\'' has single quotes",
        "arg 'Jack said \\\"hi\\\"' has double quotes",
        "a b c d e f g h i j k l m n o p q r s t u v w x y z 1 2 3 4 5 6 7 8 9",
        # This should be expanded into only one argument.
        "trailing-whitespace ",
        "",
    ]
    for line in lines:
        print(line)
        print(Argv.build(line))
        # A bare ``print`` is a no-op expression on Python 3; printing an
        # empty string emits the blank line on both Python 2 and 3.
        print("")

# Run the demonstration only when the module is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    tests()

""" test code output --->
a simple command line
['a', 'simple', 'command', 'line']

arg 'foo' is single quoted
['arg', 'foo', 'is', 'single', 'quoted']

arg "bar" is double quoted
['arg', 'bar', 'is', 'double', 'quoted']

arg "foo bar" has embedded whitespace
['arg', 'foo bar', 'has', 'embedded', 'whitespace']

arg 'Jack said \'hi\'' has single quotes
['arg', "Jack said 'hi'", 'has', 'single', 'quotes']

arg 'Jack said \"hi\"' has double quotes
['arg', 'Jack said "hi"', 'has', 'double', 'quotes']

a b c d e f g h i j k l m n o p q r s t u v w x y z 1 2 3 4 5 6 7 8 9
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '1', '2', '3', '4', '5', '6', '7', '8', '9']

trailing-whitespace 
['trailing-whitespace']


[]
"""
Gribouillis 1,391 Programming Explorer Team Colleague

I realized that the above code wouldn't run with python <= 2.5 because I used the string format method, so here is a modified version which was tested with 2.5, 2.6 and 3.1.

# argv.py
"""
    This module implements a class Argv which parses a command line
    to produce a list of arguments similar to the arguments passed
    to a program called from a command shell.

    Note that this class doesn't handle arguments expansion made by
    a specific shell (for example bash replaces $HOME by the path
    to your home directory before passing the command line to the
    program).

    An effort was made to follow as closely as possible the algorithm
    used in GNU libiberty's buildargv function, which is used by the
    gnu C compiler for example.

    Other functions are provided as methods whose behaviour imitates
    the corresponding functions of libiberty.

    * Written by Gribouillis for the python forum at Daniweb.com.
    Use this code freely, copy it, modify it, redistribute it.
"""

__all__ = [
    "Argv", "buildargv", "writeargv",
    "expandargv", "dupargv", "freeargv"
]

import re
import string

def end_of_string(eos):
    """Return a regex fragment matching end of input as the named group ``eos``."""
    template = "(?P<%s>$)"
    return template % eos

def one_or_more(pat):
    """Wrap ``pat`` in a non-capturing group repeated one or more times."""
    template = r"(?:%s)+"
    return template % pat

def zero_or_more(pat):
    """Wrap ``pat`` in a non-capturing group repeated zero or more times."""
    template = r"(?:%s)*"
    return template % pat

def one_of(*pats):
    """Return a non-capturing group matching any one of the given patterns."""
    alternatives = "|".join(pats)
    return "(?:%s)" % alternatives

def all_of(*pats):
    """Return the given patterns concatenated, so they must match in sequence."""
    separator = ""
    return separator.join(pats)

# Regex fragments shared by the tokenizer patterns.

# A backslash followed by any single character.
escaped_char = r"\\."
# A backslash optionally followed by one character; also matches a lone
# backslash, so it can be used to strip escapes from finished arguments.
escaped_opt = r"\\.?"
# string.whitespace made safe for literal use inside a character class.
# re.escape is used instead of repr() so that no stray quote characters
# (the ones surrounding the repr text) end up inside the classes.
_whitespace_chars = re.escape(string.whitespace)
# Any character that is neither whitespace nor a quote.
non_space_or_quote = r"[^%s\'\"]" % _whitespace_chars
# One whitespace character, captured as "eoarg" (end of argument).
one_space_eoarg = r"(?P<eoarg>[%s])" % _whitespace_chars
s_quote = r"[\']"
d_quote = r'[\"]'
# Characters that need a backslash in front of them when written out:
# whitespace, both quote characters and the backslash itself.
special_re = re.compile(r"[%s\'\"\\]" % _whitespace_chars)

def non_quote(quote):
    """Return ``quote`` with its first character replaced by ``[^``.

    Given a bracketed class such as ``[\']`` this yields the negated
    class ``[^\']``.
    """
    remainder = quote[1:]
    return "".join(["[^", remainder])

def quoted(quote, eos):
    """Return a regex for a string delimited by ``quote``.

    The pattern is: an opening quote, any run of escaped characters or
    non-quote characters, then either a closing quote or the end of the
    input. In the latter case the named group ``eos`` takes part in the
    match, which marks the quote as unterminated.
    """
    opening = quote
    body = zero_or_more(one_of(escaped_char, non_quote(quote)))
    closing = one_of(end_of_string(eos), quote)
    return all_of(opening, body, closing)

# One token of a command line, tried in this order: a run of unquoted
# (possibly escaped) characters, a single-quoted string, a double-quoted
# string, or one whitespace character separating two arguments.
item_regex = one_of(
    one_or_more(one_of(escaped_char, non_space_or_quote)),
    quoted(s_quote, "eoss"),
    quoted(d_quote, "eosd"),
    one_space_eoarg,
)

# DOTALL makes "." in the escape patterns match a newline as well, so a
# backslash-newline pair is one escaped character instead of a stray
# backslash followed by an argument separator.
item_re = re.compile(item_regex, re.DOTALL)
escaped_re = re.compile(escaped_opt, re.DOTALL)

class Argv(list):
    """List of command-line arguments with libiberty-style helpers.

    Use :meth:`build` to create an instance from a raw command line.
    The other methods write the arguments to a file, expand ``@file``
    response-file arguments, copy the list and empty it.
    """

    def __init__(self, *args):
        list.__init__(self, *args)

    def _repl_func(self, mo):
        """Record one ``item_re`` match while :meth:`build` scans a line.

        During parsing the last element of ``self`` is a list holding the
        text pieces of the argument currently under construction.
        """
        if mo.group("eoarg") is not None:
            # Whitespace separator: freeze the current argument into a
            # string and open a fresh piece list for the next one.
            self[-1] = ''.join(self[-1])
            self.append([])
            return
        s = mo.group(0)
        if s[0] == "'":
            # Drop the opening quote, and the closing one too unless the
            # quote was left open and ran to the end of the input.
            self[-1].append(s[1: -1 if mo.group("eoss") is None else len(s)])
        elif s[0] == '"':
            self[-1].append(s[1: -1 if mo.group("eosd") is None else len(s)])
        else:
            self[-1].append(s)

    @classmethod
    def build(cls, command_line):
        """Argv.build(command_line) --> a new Argv object containing
        arguments extracted from the command line.

        Quotes group text into a single argument. Once the line is split,
        every backslash is removed and the character following it is
        kept. Arguments that end up empty are discarded.
        """
        self = cls([[]])
        # Each match is either a piece of the current argument or a
        # separator; _repl_func files it accordingly.
        for mo in item_re.finditer(command_line):
            self._repl_func(mo)
        # The final argument has no trailing separator to close it.
        self[-1] = ''.join(self[-1])
        unescaped = [escaped_re.sub(lambda x: x.group(0)[1:], s) for s in self]
        self[:] = [s for s in unescaped if s]
        return self

    def write(self, fileobj):
        """Write the argv to a file object, with one argument per line.

        Whitespace, quote and backslash characters inside an argument
        are each preceded by a backslash.
        """
        for arg in self:
            s = special_re.sub(lambda m: "\\" + m.group(0), arg)
            fileobj.write(s)
            fileobj.write("\n")

    def expand(self):
        """Return a new object of the same class with ``@file`` arguments
        expanded.

        Each argument starting with ``@`` is taken as the path of a
        'response' file. The file's content is parsed with
        :meth:`build`, expanded recursively, and inserted in place of
        the argument. An argument whose file cannot be opened or read is
        kept unchanged. ``self`` is not modified.

        NOTE: there is no guard against a response file that refers to
        itself; such input recurses until Python's recursion limit.
        """
        result = self.__class__()
        for arg in self:
            if not arg.startswith("@"):
                result.append(arg)
                continue
            try:
                # try/finally rather than ``with`` so that the module
                # stays valid on Python 2.5.
                response_file = open(arg[1:])
                try:
                    text = response_file.read()
                finally:
                    response_file.close()
            except (IOError, OSError):
                # Treat an unreadable response file literally instead of
                # silently losing the argument.
                result.append(arg)
                continue
            result.extend(Argv.build(text).expand())
        return result

    def free(self):
        """Empties the Argv object"""
        self[:] = []

    def dup(self):
        """Return a shallow copy of this object, of the same class."""
        return self.__class__(self)

# a few wrapper functions to provide an interface similar to
# gnu libiberty's argv interface

def buildargv(line):
    """Parse ``line`` into an Argv; function-style spelling of Argv.build."""
    parsed = Argv.build(line)
    return parsed

def writeargv(argv, fileobj):
    """Write ``argv`` to ``fileobj`` by delegating to ``argv.write``."""
    emit = argv.write
    emit(fileobj)

def expandargv(argv):
    """Return the result of ``argv.expand()``."""
    expanded = argv.expand()
    return expanded

def dupargv(argv):
    """Return the result of ``argv.dup()``."""
    duplicate = argv.dup()
    return duplicate

def freeargv(argv):
    """Call ``argv.free()``; nothing is returned."""
    release = argv.free
    release()


# A simple test function
# we use the same test command lines as argv.c in gnu libiberty.

def tests():
    """Print each sample command line, the Argv parsed from it, and a
    blank separator line."""
    samples = (
        "a simple command line",
        "arg 'foo' is single quoted",
        "arg \"bar\" is double quoted",
        "arg \"foo bar\" has embedded whitespace",
        "arg 'Jack said \\'hi\\'' has single quotes",
        "arg 'Jack said \\\"hi\\\"' has double quotes",
        "a b c d e f g h i j k l m n o p q r s t u v w x y z 1 2 3 4 5 6 7 8 9",
        # This should be expanded into only one argument.
        "trailing-whitespace ",
        "",
    )
    for sample in samples:
        parsed = Argv.build(sample)
        print(sample)
        print(parsed)
        print("")

# Run the demonstration only when the module is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    tests()
Be a part of the DaniWeb community

We're a friendly, industry-focused community of developers, IT pros, digital marketers, and technology enthusiasts meeting, networking, learning, and sharing knowledge.