I have a text file in the format shown below:

sourceFiles {
        hello.cpp,
        hi.cpp,
        main.cpp,
    }

    headerFiles {
        hello.h,
        hi.h,
    }

    path {
       source [
                what,
                how,
                which,
            ]

       header [
                include,
                inc,
                head,
            ]
    }

I need to populate several lists from this text file. So far I have done this:

# Regex-based extraction of the four lists from the config text.
# NOTE: this snippet uses Python 2 print statements, and relies on `re`
# being imported and on `data` being defined elsewhere (neither is shown
# here; `data` is presumably the text of the config file).
sourcePath = None
headerPath = None

# Body of each `sourceFiles { ... }` block (non-greedy, up to the first `}`).
regex = r'sourceFiles\s*\{(.*?)\}'
match = re.findall(regex, data, re.DOTALL | re.IGNORECASE)
# The `.` in the pattern is unescaped, so it matches any character, not only
# a literal dot.  `[0]` keeps the first block's names and raises IndexError
# when no block was found.
sources = [re.findall(r'\w+.\w+', each) for each in match][0]

# Same extraction for the `headerFiles { ... }` block.
regex = r'headerFiles\s*\{(.*?)\}'
match = re.findall(regex, data, re.DOTALL | re.IGNORECASE)
headers = [re.findall(r'\w+.\w+', each) for each in match][0]

# Collect every `[ ... ]` group found inside the first `path { ... }` block.
regex = r'path\s*\{(.*?)\}'
match = re.findall(regex, data, re.DOTALL | re.IGNORECASE)
paths = [re.findall(r'\[.*?\]', each, re.DOTALL) for each in match][0]

if paths:
    # The groups are taken by position (first = source, second = header);
    # the `source` / `header` labels in front of them are never checked.
    temp = re.findall(r'\[(.+?)\]', paths[0], re.DOTALL)
    # Only words immediately followed by a comma are captured, so an entry
    # with no trailing comma is silently dropped instead of causing an error.
    sourcePath = [re.findall(r'(\w+)\,', each) for each in temp][0]
    # paths[1] raises IndexError if only one `[ ... ]` group was found.
    temp = re.findall(r'\[(.+?)\]', paths[1], re.DOTALL)
    headerPath = [re.findall(r'(\w+)\,', each) for each in temp][0]
print sources
print headers
print sourcePath
print headerPath

But this has several problems. One problem is that if I don't put a , after every entry, no exception is raised. I want a missing , after any entry to raise an exception. Moreover, I want much stronger syntax checking of this file. Please help me rewrite it.

Recommended Answers

All 2 Replies

A reasonable way to start good parsing is to write a tokenizer:

from __future__ import print_function
import re
from collections import namedtuple

# One alternative per lexeme class:
#   group 1 - a file name of the form "name.ext"
#   group 2 - a bare word, or a single brace / bracket / comma
#   group 3 - a newline
regex = re.compile(r'(\w+[.]\w+)|(\w+|[{}\[\],])|(\n)')

# Token type for every reserved word and punctuation character.  Each
# reserved word maps to its own upper-cased spelling.
d1 = {
    keyword: keyword.upper()
    for keyword in ('sourceFiles', 'headerFiles', 'path', 'header', 'source')
}
d1.update({'{': 'LB', '}': 'RB', '[': 'LS', ']': 'RS', ',': 'COMMA'})


# Tokens are plain tuples here; namedtuple('Token', 'type data lineno column')
# is the alternative if named fields are wanted.
def Token(*args):
    """Return the given token fields unchanged, as a plain tuple."""
    return args

def _invalid_token_error(unmatched, lineno, column):
    """Build the error for *unmatched* text that starts at *column*."""
    # Skip leading blanks so the column points at the offending character.
    column += len(unmatched) - len(unmatched.lstrip())
    return RuntimeError(
        'Invalid token at line {} column {}'.format(lineno, column))


def generate_tokens(text):
    """Lazily split *text* into tokens, tracking line and column numbers.

    Every token is produced by calling ``Token(type, data, lineno, column)``
    with 1-based ``lineno`` and ``column``.  ``type`` is ``'FILENAME'`` for
    ``name.ext`` lexemes; for any other lexeme it is looked up in ``d1`` and
    defaults to ``'WORD'``.  Newlines yield no token; they only advance the
    line counter and reset the column.

    Raises:
        RuntimeError: if anything other than whitespace appears between two
            lexemes or after the last one.  The message gives the line and
            column of the first offending character in both cases.
    """
    idx = 0  # offset in text just past the previous lexeme
    lineno = 1
    column = 1
    for match in regex.finditer(text):
        # Whatever lies between two lexemes must be pure whitespace.
        gap = text[idx:match.start()]
        if gap.strip():
            raise _invalid_token_error(gap, lineno, column)
        column += len(gap)
        # Exactly one of the three alternatives matched.
        filename, symbol, newline = match.groups()
        if newline:
            lineno += 1
            column = 1
        else:
            if symbol:
                yield Token(d1.get(symbol, 'WORD'), symbol, lineno, column)
            else:
                yield Token('FILENAME', filename, lineno, column)
            column += match.end() - match.start()
        idx = match.end()
    # Text left over after the last lexeme gets the same check, and the same
    # line/column report, as text between lexemes.
    tail = text[idx:]
    if tail.strip():
        raise _invalid_token_error(tail, lineno, column)

# Sample configuration used to demonstrate the tokenizer.  The first line of
# the literal holds only spaces, so the first token is reported on line 2.
example = """    
sourceFiles {
        hello.cpp,
        hi.cpp,
        main.cpp,
    }

    headerFiles {
        hello.h,
        hi.h,
    }

    path {
       source [
                what,
                how,
                which,
            ]

       header [
                include,
                inc,
                head,
            ]
    }
"""

if __name__ == '__main__':
    # Named `tokens` rather than `input` so the builtin input() stays usable.
    tokens = list(generate_tokens(example))
    print(tokens)

""" my output -->
[('SOURCEFILES', 'sourceFiles', 2, 1), ('LB', '{', 2, 13), ('FILENAME', 'hello.cpp', 3, 9), ('COMMA', ',', 3, 18), ('FILENAME', 'hi.cpp', 4, 9), ('COMMA', ',', 4, 15), ('FILENAME', 'main.cpp', 5, 9), ('COMMA', ',', 5, 17), ('RB', '}', 6, 5), ('HEADERFILES', 'headerFiles', 8, 5), ('LB', '{', 8, 17), ('FILENAME', 'hello.h', 9, 9), ('COMMA', ',', 9, 16), ('FILENAME', 'hi.h', 10, 9), ('COMMA', ',', 10, 13), ('RB', '}', 11, 5), ('PATH', 'path', 13, 5), ('LB', '{', 13, 10), ('SOURCE', 'source', 14, 8), ('LS', '[', 14, 15), ('WORD', 'what', 15, 17), ('COMMA', ',', 15, 21), ('WORD', 'how', 16, 17), ('COMMA', ',', 16, 20), ('WORD', 'which', 17, 17), ('COMMA', ',', 17, 22), ('RS', ']', 18, 13), ('HEADER', 'header', 20, 8), ('LS', '[', 20, 15), ('WORD', 'include', 21, 17), ('COMMA', ',', 21, 24), ('WORD', 'inc', 22, 17), ('COMMA', ',', 22, 20), ('WORD', 'head', 23, 17), ('COMMA', ',', 23, 21), ('RS', ']', 24, 13), ('RB', '}', 25, 5)]
"""

Great information thanks for sharing!

Be a part of the DaniWeb community

We're a friendly, industry-focused community of developers, IT pros, digital marketers, and technology enthusiasts meeting, networking, learning, and sharing knowledge.