osdir.com


[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Extract sentences in nested parentheses using Python


I am trying to extract all strings in nested parentheses (along with the parentheses themselves) from my .txt file. Please see the sample .txt file that I have used in this example here: (https://drive.google.com/open?id=1UKc0ZgY9Fsz5O1rSeBCLqt5dwZkMaQgr).

I have written three different versions of the code, but none of them is able to extract all of the nested parentheses; each one extracts only a portion of them. Any advice on what I've done wrong would really help!

Here are the three attempts I have made so far:

1st attempt:

import re
from os.path import join

def balanced_braces(args):
    """Collect the text enclosed by each top-level pair of parentheses.

    Every string in ``args`` is scanned on its own: the nesting depth is
    reset for each string, so a group that is opened in one string and
    closed in another is not reported.

    The outermost parentheses of a group are dropped, nested parentheses
    inside it are kept verbatim, and surrounding whitespace is stripped.

    Args:
        args: An iterable of strings (for example an open text file, which
            yields one line at a time).

    Returns:
        A list of strings, one per closed top-level group, in the order in
        which the groups were closed.
    """
    parts = []
    for arg in args:
        if '(' not in arg:
            continue
        chars = []
        depth = 0
        for c in arg:
            if c == '(':
                # Only nested openers belong to the captured text.
                if depth > 0:
                    chars.append(c)
                depth += 1
            elif c == ')':
                if depth == 0:
                    # Stray ')' with nothing open: ignore it so the depth
                    # never goes negative and hides later valid groups.
                    continue
                depth -= 1
                if depth > 0:
                    chars.append(c)
                else:
                    parts.append(''.join(chars).strip())
                    chars = []
            elif depth > 0:
                chars.append(c)
    return parts

# The open file is handed to balanced_braces directly; iterating a file
# object yields its lines one at a time.
with open('lan sample text file.txt', 'r') as sample_file:
    t1 = balanced_braces(sample_file)
print(t1)


Output:

['"xE\'", PUT(xx.xxxx.),"\'"', '"TRUuuuth"', "xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.", '"xE\'", PUT(xx.xxxx.),"\'"', '"CUuuiiiiuth"', "xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv."]



2nd attempt:

from pyparsing import nestedExpr

# pyparsing expression for text nested between '(' and ')'.
matchedParens = nestedExpr('(', ')')
with open('lan sample text file.txt', 'r') as sample_file:
    # Each line is searched separately and every match is printed.
    for line in sample_file:
        for match in matchedParens.searchString(line):
            print(match)


Output:

[['"xE\'"', ',', 'PUT', ['xx.xxxx.'], ',', '"\'"']]
[['"TRUuuuth"']]
[['xxx', ['xx_ix', 'as', 'format', "'xxxx-xx'"], 'gff', '&jfjfsj_jfjfj.']]
[['xxx', ['xx_ix', 'as', 'format', "'xxxx-xx'"], 'lec', '&jgjsd_vnv.']]
[['"xE\'"', ',', 'PUT', ['xx.xxxx.'], ',', '"\'"']]
[['"CUuuiiiiuth"']]
[['xxx', ['xx_ix', 'as', 'format', "'xxxx-xx'"], 'gff', '&jfjfsj_jfjfj.']]
[['xxx', ['xx_ix', 'as', 'format', "'xxxx-xx'"], 'lec', '&jgjsd_vnv.']]



3rd attempt:

def parse_segments(source, recurse=False):
    """Split ``source`` into text outside and inside top-level parentheses.

    The result lists, in order:

    1. every non-empty stretch of text that precedes a top-level
       parenthesised group, followed by whatever text remains after the
       last closed group (this remainder also contains a group that was
       opened but never closed);
    2. the content of each closed top-level group with its bounding
       parentheses removed.

    Args:
        source: The string to scan.
        recurse: When true, the content of each group is parsed again with
            this function and the resulting pieces are added instead of
            the raw content.

    Returns:
        A list of strings as described above; an empty list for an empty
        ``source``.
    """
    depth = 0          # number of currently open, unmatched '('
    start_pos = 0      # start of the text not yet assigned to any piece
    opened = False     # True while inside a top-level group
    open_pos = 0       # index of the '(' that opened the current group

    finished = []
    segments = []

    for cur_pos, character in enumerate(source):
        if character == '(':
            depth += 1
            if not opened:
                open_pos = cur_pos
            opened = True
        elif character == ')' and depth > 0:
            # A ')' with nothing open is ignored so the depth cannot go
            # negative and make the next '(' look like a complete group.
            depth -= 1

        if opened and depth == 0:
            # The top-level group just closed: record it and the text
            # that preceded it.
            segments.append(source[open_pos:cur_pos + 1])
            clean = source[start_pos:open_pos]
            if clean:
                finished.append(clean)
            opened = False
            start_pos = cur_pos + 1

    if start_pos != len(source):
        # Keep anything left over after the last closed group.
        finished.append(source[start_pos:])

    for item in segments:
        # Drop the bounding parentheses.
        pruned = item[1:-1]
        if recurse:
            finished.extend(parse_segments(pruned, recurse))
        else:
            finished.append(pruned)

    return finished

# Run parse_segments on each line of the sample file and print the pieces
# found in that line.
with open('lan sample text file.txt', 'r') as sample_file:
    for line in sample_file:
        print(parse_segments(line))


Output:

['kkkkk;\n']
['\n']
['  select xx', ' jdfjhf:jhfjj from xxxx_x_xx_L ;\n', '"xE\'", PUT(xx.xxxx.),"\'"']
['quit; \n']
['\n']
['/* 1.xxxxx FROM xxxx_x_Ex_x */ \n']
['proc sql; ', ';\n', '"TRUuuuth"']
['hhhjhfjs as fdsjfsj:\n']
['select * from djfkjd to jfkjs\n']
['(\n']
['SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj\n']
['\tFROM &xxx..xxx_xxx_xxE\n']
["where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and \n"]
['      ', ')\n', "xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv."]
[' );\n']
['\n']
['\n']
['jjjjjj;\n']
['\n']
['  select xx', ' jdfjhf:jhfjj from xxxx_x_xx_L ;\n', '"xE\'", PUT(xx.xxxx.),"\'"']
['quit; \n']
['\n']
['/* 1.xxxxx FROM xxxx_x_Ex_x */ \n']
['proc sql; ', ';\n', '"CUuuiiiiuth"']
['hhhjhfjs as fdsjfsj:\n']
['select * from djfkjd to jfkjs\n']
['(SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj\n']
['\tFROM &xxx..xxx_xxx_xxE\n']
["where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and \n"]
['      ', ')\n', "xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv."]
[' );']




The intended output, which I have been unable to produce, should look something like this:


("xE'", PUT(xx.xxxx.),"'")
("TRUuuuth")
(
SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj
    FROM &xxx..xxx_xxx_xxE
where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and
      (xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.))
 )
("xE'", PUT(xx.xxxx.),"'")
("CUuuiiiiuth")
(SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj
    FROM &xxx..xxx_xxx_xxE
where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and
      (xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.))(( ))
 )