|
[rfc] regex rewrite: msg#00021lang.smalltalk.gnu.general
This patch is actually bigger than I expected. :-) Basically, matching with #searchRegex:from:to:, #searchRegex:startingAt: and #searchRegex: now returns a full fledged RegexResults object or nil if the matching did not succeed. I have not provided yet the 'ifAbsent:' variants. If you do not care about the subexpressions, use #indexOfRegex: and friends, which will return an Interval as before. Most methods previously did the matching twice, in practice. First, they found the starting position of the match; then, they called re_match to find the length of the match. This sucks, obviously, so I rewrote all the methods. I am using the internal #searchRegexInternal:from:to: method (which is what Mike called #searchRegexFull:from:to:) from all the methods. I could have used #indexOfRegex: as well, but this avoids the overhead of creating an Interval. There is a contextual modification to Interval, making it more compatible with VisualWorks as well. Interval can return a #first and #last even if the Interval is empty. These are the start and stop object that it was created with. The private methods #start, #stop and #step have thus been superseded by #first, #last and #increment. You'll note that this patch already uses the new pragma-like syntax for C call-outs. Unfortunately the arch repository is down currently, I'll have to write to the guys at sourcecontrol.net. I was about to make the #copyReplacingRegex:with: methods use #bindWithArguments: but I decided that this patch was already big enough. What do you think about this additional change? Now, all that remains before this stuff can be made ``more'' official, is writing some SUnit tests... I'll get round to it, but if somebody wants to help, patches are welcome as usual. Paolo 2005-05-27 Paolo Bonzini <bonzini@xxxxxxx> Mike Anderson <msasjf@xxxxxxxxxxx> * kernel/Interval.st: Add #first, #last, #increment. * examples/re.c (): Return a struct pre_registers * from reh_search. New reh_free_registers function, register it. * examples/regex.st: New class CRegexRegisters, modify all the methods to use it. Name the C call-out method #searchRegexInternal:from:to:. New class RegexResults, return it from #searchRegex:from:to:, #searchRegex:startingAt:, #searchRegex:. --- orig/examples/re.c +++ mod/examples/re.c @@ -71,8 +71,10 @@ static void markRegexAsMRU (int i); /* Functions exported to Smalltalk */ static OOP reh_make_cacheable (OOP patternOOP); -static int reh_search (OOP srcOOP, OOP patternOOP, int from, int to), -reh_match (OOP srcOOP, OOP patternOOP, int from, int to); +static struct pre_registers *reh_search (OOP srcOOP, OOP patternOOP, + int from, int to); +static int reh_match (OOP srcOOP, OOP patternOOP, int from, int to); +static void reh_free_registers(struct pre_registers *regs); static RegexCacheEntry cache[REGEX_CACHE_SIZE]; @@ -230,26 +232,35 @@ reh_make_cacheable (OOP patternOOP) } /* Search helper function */ -int +struct pre_registers * reh_search (OOP srcOOP, OOP patternOOP, int from, int to) { int res = 0; const char *src; struct pre_pattern_buffer *regex; + struct pre_registers *regs; RegexCaching caching; caching = lookupRegex (patternOOP, ®ex); if (caching != REGEX_CACHE_HIT && compileRegex (patternOOP, regex) != NULL) - return -100; + return NULL; /* now search */ src = &STRING_OOP_AT (OOP_TO_OBJ (srcOOP), 1); - res = pre_search (regex, src, to, from - 1, to - from + 1, NULL); + regs = (struct pre_registers *) calloc (1, sizeof (struct pre_registers)); + res = pre_search (regex, src, to, from - 1, to - from + 1, regs); if (caching == REGEX_NOT_CACHED) pre_free_pattern (regex); - return res + 1; + return regs; +} + +void +reh_free_registers(struct pre_registers *regs) +{ + pre_free_registers(regs); + free(regs); } /* Match helper function */ @@ -281,6 +292,7 @@ gst_initModule (VMProxy * proxy) vmProxy = proxy; vmProxy->defineCFunc ("reh_search", reh_search); vmProxy->defineCFunc ("reh_match", reh_match); + vmProxy->defineCFunc ("reh_free_registers", reh_free_registers); vmProxy->defineCFunc ("reh_make_cacheable", reh_make_cacheable); regexClass = vmProxy->classNameToOOP ("Regex"); --- orig/examples/regex.st +++ mod/examples/regex.st @@ -9,7 +9,7 @@ "====================================================================== | | Copyright 2001, 2003, 2005 Free Software Foundation, Inc. -| Written by Dragomir Milivojevic. +| Written by Dragomir Milevojevic, Paolo Bonzini, Mike Anderson. | | This file is part of the GNU Smalltalk class library. | @@ -50,6 +50,91 @@ them outside the loop, but special care object is used whenever possible (when converting Strings to Regex, the cache is sought for an equivalent, already constructed Regex).'. +CStruct subclass: #CRegexRegisters + declaration: #( (#allocated #int) + (#numRegs #int) + (#beg (#ptr #int)) + (#end (#ptr #int)) ) + classVariableNames: '' + poolDictionaries: '' + category: 'Regex' +! + +Object subclass: #RegexResults + instanceVariableNames: 'subject from to registers match cache' + classVariableNames: '' + poolDictionaries: '' + category: 'Regex' +! + +!RegexResults methodsFor: 'accessing'! + +subject + ^subject! + +size + ^registers size! + +from + ^from! + +fromAt: anIndex + | reg | + anIndex = 0 ifTrue: [ ^from ]. + reg := registers at: anIndex. + ^reg isNil ifTrue: [ nil ] ifFalse: [ reg first ]! + +to + ^to! + +toAt: anIndex + | reg | + anIndex = 0 ifTrue: [ ^from ]. + reg := registers at: anIndex. + ^reg isNil ifTrue: [ nil ] ifFalse: [ reg last ]! + +match + match isNil + ifTrue: [ match := self subject copyFrom: from to: to ]. + ^match! + +matchInterval + ^from to: to! + +at: anIndex + | reg text | + anIndex = 0 ifTrue: [ ^self match ]. + (cache at: anIndex) isNil + ifTrue: [ + reg := registers at: anIndex. + text := reg isNil + ifTrue: [ nil ] + ifFalse: [ self subject copyFrom: reg first to: reg last ]. + cache at: anIndex put: text ]. + ^cache at: anIndex! + +intervalAt: anIndex + ^anIndex = 0 + ifTrue: [ from to: to ] + ifFalse: [ registers at: anIndex ]! + +size + ^registers size! ! + +!RegexResults methodsFor: 'private'! + +initialize: regs subject: aString + from := regs matchBeg. + to := regs matchEnd. + registers := (1 to: regs numRegs value - 1) collect: [ :i | + | beg end | + beg := (regs begAt: i). + end := (regs endAt: i). + end < 0 ifTrue: [ nil ] ifFalse: [ beg to: end ] ]. + cache := Array new: registers size. + subject := aString! + + " --- external function definitions --- " !Regex class methodsFor: 'C call-outs'! @@ -59,14 +144,38 @@ fromString: aString !String methodsFor: 'C call-outs'! -searchRegex: pattern from: from to: to - <cCall: 'reh_search' returning: #int - args: #(#selfSmalltalk #smalltalk #int #int)>! - lengthOfRegexMatch: pattern from: from to: to <cCall: 'reh_match' returning: #int + args: #(#selfSmalltalk #smalltalk #int #int)>! + +searchRegexInternal: pattern from: from to: to + <cCall: 'reh_search' returning: CRegexRegisters type args: #(#selfSmalltalk #smalltalk #int #int)>! ! +!CRegexRegisters methodsFor: 'C call-outs'! + +begAt: i + ^(self beg value + i) value + 1! + +matchBeg + | begValue | + begValue := self beg value. + begValue isNil ifTrue: [ ^-1 ]. + ^begValue value + 1! + +endAt: i + ^(self end value + i) value! + +matchEnd + | endValue | + endValue := self end value. + endValue isNil ifTrue: [ ^-1 ]. + ^endValue value! + +free + <cCall: 'reh_free_registers' returning: #void + args: #(#self)>! ! + "--------------------------------------------------------------------------" !Regex class methodsFor: 'instance creation'! @@ -138,7 +247,11 @@ printOn: aStream =~ regexString "Answer whether an occurrence of the regex is present in the receiver" - ^(self searchRegex: regexString from: 1 to: self size) > 0 + | regs gotIt | + regs := self searchRegexInternal: regexString from: 1 to: self size. + gotIt := regs beg value notNil. + regs free. + ^gotIt ! asRegex @@ -146,60 +259,92 @@ asRegex ^Regex fromString: self ! +searchRegex: pattern + | regs | + regs := self searchRegexInternal: pattern from: 1 to: self size. + regs matchBeg = -1 ifTrue: [ regs free. ^nil ]. + ^[ RegexResults new initialize: regs subject: self ] + ensure: [ regs free ]! + +searchRegex: pattern startingAt: anIndex + | regs | + regs := self searchRegexInternal: pattern from: anIndex to: self size. + regs matchBeg = -1 ifTrue: [ regs free. ^nil ]. + ^[ RegexResults new initialize: regs subject: self ] + ensure: [ regs free ]! + +searchRegex: pattern from: from to: to + | regs | + regs := self searchRegexInternal: pattern from: from to: to. + regs matchBeg = -1 ifTrue: [ regs free. ^nil ]. + ^[ RegexResults new initialize: regs subject: self ] + ensure: [ regs free ]! + indexOfRegex: regexString ifAbsent: excBlock "Answer whether an occurrence of the regex is present in the receiver" - | start len | - start := self searchRegex: regexString from: 1 to: self size. - start > 0 ifFalse: [ ^excBlock value ]. - - len := self lengthOfRegexMatch: regexString from: start to: self size. - ^start to: start + len - 1 + | regs beg end | + regs := self searchRegexInternal: regexString from: 1 to: self size. + beg := regs matchBeg. + end := regs matchEnd. + regs free. + ^beg >= 1 + ifTrue: [ beg to: end ] + ifFalse: [ excBlock value ] ! indexOfRegex: regexString startingAt: index ifAbsent: excBlock - | start len | - start := self searchRegex: regexString from: index to: self size. - start > 0 ifFalse: [ ^excBlock value ]. - - len := self lengthOfRegexMatch: regexString from: start to: self size. - ^start to: start + len - 1 -! - -indexOfRegex: regexString from: start to: end ifAbsent: excBlock - | idx len | - idx := self searchRegex: regexString from: idx to: end. - idx > 0 ifFalse: [ ^excBlock value ]. - - len := self lengthOfRegexMatch: regexString from: idx to: self size. - ^start to: start + len - 1 + | regs beg end | + regs := self searchRegexInternal: regexString from: index to: self size. + beg := regs matchBeg. + end := regs matchEnd. + regs free. + ^beg >= 1 + ifTrue: [ beg to: end ] + ifFalse: [ excBlock value ] +! + +indexOfRegex: regexString from: from to: to ifAbsent: excBlock + | regs beg end | + regs := self searchRegexInternal: regexString from: from to: to. + beg := regs matchBeg. + end := regs matchEnd. + regs free. + ^beg >= 1 + ifTrue: [ beg to: end ] + ifFalse: [ excBlock value ] ! indexOfRegex: regexString - "Answer whether an occurrence of the regex is present in the receiver" - | start len | - start := self searchRegex: regexString from: 1 to: self size. - start > 0 ifFalse: [ ^nil ]. - - len := self lengthOfRegexMatch: regexString from: start to: self size. - ^start to: start + len - 1 + | regs beg end | + regs := self searchRegexInternal: regexString from: 1 to: self size. + beg := regs matchBeg. + end := regs matchEnd. + regs free. + ^beg >= 1 + ifTrue: [ beg to: end ] + ifFalse: [ nil ] ! indexOfRegex: regexString startingAt: index - | start len | - start := self searchRegex: regexString from: index to: self size. - start > 0 ifFalse: [ ^nil ]. - - len := self lengthOfRegexMatch: regexString from: start to: self size. - ^start to: start + len - 1 -! - -indexOfRegex: regexString from: start to: end - | idx len | - idx := self searchRegex: regexString from: idx to: end. - idx > 0 ifFalse: [ ^nil ]. - - len := self lengthOfRegexMatch: regexString from: start to: self size. - ^start to: start + len - 1 + | regs beg end | + regs := self searchRegexInternal: regexString from: index to: self size. + beg := regs matchBeg. + end := regs matchEnd. + regs free. + ^beg >= 1 + ifTrue: [ beg to: end ] + ifFalse: [ nil ] +! + +indexOfRegex: regexString from: from to: to + | regs beg end | + regs := self searchRegexInternal: regexString from: from to: to. + beg := regs matchBeg. + end := regs matchEnd. + regs free. + ^beg >= 1 + ifTrue: [ beg to: end ] + ifFalse: [ nil ] ! matchRegex: pattern @@ -217,20 +362,18 @@ matchRegex: pattern from: from to: to occurrencesOfRegex: pattern from: from to: to "Returns count of how many times pattern repeats in string" - | res idx len regex | + | res idx regex beg end regs | regex := pattern asRegex. res := 0. idx := from. [ - idx <= to and: [ - idx := self searchRegex: regex from: idx to: to. - idx > 0 ] + regs := self searchRegexInternal: regex from: idx to: to. + beg := regs matchBeg. + end := regs matchEnd. + regs free. + beg >= 1 ] whileTrue: [ - - len := self lengthOfRegexMatch: regex from: idx to: to. - len = 0 ifTrue: [ len := 1 ]. - - idx := idx + len. + idx := end max: beg + 1. res := res + 1. ]. @@ -252,23 +395,21 @@ occurrencesOfRegex: pattern copyFrom: from to: to replacingRegex: pattern with: str "Replaces first occurance of pattern with provided string" - | res idx len | + | res regs beg end | + regs := self searchRegexInternal: pattern from: from to: to. + beg := regs matchBeg. + end := regs matchEnd. + regs free. + + beg >= 1 + ifTrue: [ + res := self species new: (to - from) - (end - beg) + str size. + res replaceFrom: 1 to: beg - from with: self startingAt: from. + res replaceFrom: beg - from + 1 to: beg - from + str size with: str. + res replaceFrom: beg - from + str size + 1 to: res size with: self startingAt: end - from + 2 ] + ifFalse: [ res := self copyFrom: from to: to ]. - idx := self searchRegex: pattern from: from to: to. - - idx > 0 ifTrue: [ - res := self copyFrom: from to: idx - 1. - res := res, str. - - idx := idx + (self lengthOfRegexMatch: pattern from: idx to: to). - - idx <= to ifTrue: - [ res := res, (self copyFrom: idx to: to) ]. - - ^ res - ]. - - ^self copyFrom: from to: to + ^res ! copyReplacingRegex: pattern with: str @@ -280,29 +421,24 @@ copyReplacingRegex: pattern with: str copyFrom: from to: to replacingAllRegex: pattern with: str "Replaces all occurances of pattern between boundaries with specified string" - | res oldIdx idx len regex | - idx := from. - res := WriteStream on: self copyEmpty. + | res idx regex beg end regs | regex := pattern asRegex. - + res := WriteStream on: (String new: to - from + 1). + idx := from. [ - oldIdx := idx. - idx <= to and: [ - idx := self searchRegex: regex from: idx to: to. - idx > 0 ] + regs := self searchRegexInternal: regex from: idx to: to. + beg := regs matchBeg. + end := regs matchEnd. + regs free. + beg >= 1 ] whileTrue: [ - oldIdx to: idx - 1 do: [ :each | - res nextPut: (self at: each) ]. - - len := self lengthOfRegexMatch: regex from: idx to: to. - len = 0 ifTrue: [ len := 1 ]. - + res next: beg - idx putAll: self startingAt: idx. res nextPutAll: str. - idx := idx + len. + idx := end + 1. + beg > end ifTrue: [ res nextPut: (self at: idx). idx := idx + 1 ]. + idx > self size ifTrue: [ ^res contents ]. ]. - - oldIdx to: to do: [ :each | - res nextPut: (self at: each) ]. + res next: to - idx + 1 putAll: self startingAt: idx. ^res contents ! @@ -316,20 +452,18 @@ copyReplacingAllRegex: pattern with: str onOccurrencesOfRegex: pattern from: from to: to do: body "Searches for pattern and executed passed instruction-body (as a trigger)" - | idx len res regex | - idx := from. + | idx regex beg end regs | regex := pattern asRegex. - + idx := from. [ - idx <= to and: [ - idx := self searchRegex: regex from: idx to: to. - idx > 0 ] + regs := self searchRegexInternal: regex from: idx to: to. + beg := regs matchBeg. + end := regs matchEnd. + regs free. + beg >= 1 ] whileTrue: [ - len := self lengthOfRegexMatch: regex from: idx to: to. - - body value: idx value: len. - len = 0 ifTrue: [ len := 1 ]. - idx := idx + len. + body value: beg value: end - beg + 1. + idx := end + 1 max: beg + 1. ]. ! @@ -340,28 +474,24 @@ onOccurrencesOfRegex: pattern do: body ! tokenize: pattern from: from to: to - - | res oldIdx idx len regex | - idx := from. - res := WriteStream on: (Array new: 10). + | res idx regex beg end regs tokStart | regex := pattern asRegex. - + res := WriteStream on: (Array new: 10). + idx := from. + tokStart := 1. [ - oldIdx := idx. - idx <= to and: [ - idx := self searchRegex: regex from: idx to: to. - idx > 0 ] + regs := self searchRegexInternal: regex from: idx to: to. + beg := regs matchBeg. + end := regs matchEnd. + regs free. + beg >= 1 ] whileTrue: [ - len := self lengthOfRegexMatch: regex from: idx to: to. - res nextPut: (self copyFrom: oldIdx to: idx - 1). - - len = 0 ifTrue: [ len := 1 ]. - idx := idx + len. + res nextPut: (self copyFrom: tokStart to: beg - 1). + tokStart := end + 1. + idx := beg + 1 max: end + 1. ]. - oldIdx <= to ifTrue: [ - res nextPut: (self copyFrom: oldIdx to: to) - ]. + res nextPut: (self copyFrom: tokStart to: to). ^res contents ! @@ -374,7 +504,6 @@ tokenize: pattern ! tokenize - ^self tokenize: '[\n\t ]+' from: 1 to: self size ! ! --- orig/kernel/Interval.st +++ mod/kernel/Interval.st @@ -8,7 +8,7 @@ "====================================================================== | -| Copyright 1988,92,94,95,99,2000,2001,2002 +| Copyright 1988,92,94,95,99,2000,2001,2002,2005 | Free Software Foundation, Inc. | Written by Steve Byrne. | @@ -219,6 +219,18 @@ printOn: aStream print: stop ]. aStream nextPut: $) +! + +first + ^first +! + +last + ^stop - ((stop - start) \\ step) +! + +increment + ^step ! ! @@ -252,17 +264,5 @@ initializeFrom: startInteger to: stopInt start := startInteger. stop := stopInteger. step := stepInteger -! - -start - ^start -! - -stop - ^stop -! - -step - ^step ! ! _______________________________________________ help-smalltalk mailing list help-smalltalk@xxxxxxx http://lists.gnu.org/mailman/listinfo/help-smalltalk |
|
| <Prev in Thread] | Current Thread | [Next in Thread> |
|---|---|---|
| Previous by Date: | Re: Regex sub -expressions: 00021, Paolo Bonzini |
|---|---|
| Next by Date: | Re: Regex sub -expressions: 00021, Mike Anderson |
| Previous by Thread: | Bloxi: 00021, Holger Peters |
| Next by Thread: | Re: [rfc] regex rewrite: 00021, Mike Anderson |
| Indexes: | [Date] [Thread] [Top] [All Lists] |
| News | FAQ | advertise |