cpython/Lib/regsub.py

# Regular expression subroutines:
# sub(pat, repl, str): replace first occurrence of pattern in string
# gsub(pat, repl, str): replace all occurrences of pattern in string
# split(str, pat, maxsplit): split string using pattern as delimiter
# splitx(str, pat, maxsplit): split string using pattern as delimiter plus
#			      return delimiters


import regex


# Replace the first match of pattern pat in string str with repl and
# return the new string; when nothing matches, str itself is returned.
# repl is run through expand(), so \digit refers to a subpattern of
# the match and \\ stands for one backslash.  pat may be a pattern
# string or an already compiled pattern (see compile()).

def sub(pat, repl, str):
	matcher = compile(pat)
	if matcher.search(str) < 0:
		# search() reports failure with a negative result
		return str
	groups = matcher.regs
	first, last = groups[0]		# span of the whole match
	head = str[:first]
	tail = str[last:]
	return head + expand(repl, groups, str) + tail


# Replace every non-overlapping match of pattern pat in string str
# with repl (expanded the same way as for sub()) and return the new
# string.  An empty match found exactly where the previous match
# ended is skipped, so e.g. gsub('', '-', 'abc') returns '-a-b-c-'.

def gsub(pat, repl, str):
	matcher = compile(pat)
	result = ''
	pos = 0			# where the next search starts
	seen_match = 0		# becomes true after the first replacement
	while 1:
		if matcher.search(str, pos) < 0:
			break
		groups = matcher.regs
		lo, hi = groups[0]
		if seen_match and lo == hi and hi == pos:
			# Empty match right where the previous match ended:
			# retry one character further on, or stop if that is
			# impossible.
			if pos >= len(str):
				break
			if matcher.search(str, pos + 1) < 0:
				break
			groups = matcher.regs
			lo, hi = groups[0]
		# Copy the unmatched text, then the expanded replacement.
		result = result + str[pos:lo] + expand(repl, groups, str)
		pos = hi
		seen_match = 1
	return result + str[pos:]


# Return the list of fields of string str that are separated by
# delimiters matching pattern pat.  Empty matches never count as
# delimiters, so e.g. split('abc', '') returns ['abc'].
# The optional 3rd argument limits the number of splits performed;
# 0 (the default) means no limit.

def split(str, pat, maxsplit = 0):
	keep_delimiters = 0	# fields only
	return intsplit(str, pat, maxsplit, keep_delimiters)

# Like split(), but the returned list also contains the delimiters:
# each matched delimiter follows the field it terminates.  Empty
# matches never count as delimiters, so e.g. splitx('abc', '')
# returns ['abc'].
# The optional 3rd argument limits the number of splits performed;
# 0 (the default) means no limit.


def splitx(str, pat, maxsplit = 0):
	keep_delimiters = 1	# fields and delimiters
	return intsplit(str, pat, maxsplit, keep_delimiters)
	
# Internal function used to implement split() and splitx().
# Returns the list of fields of str delimited by non-empty matches of
# pat.  If retain is true, each delimiter is added to the list right
# after the field it terminates.  A non-zero maxsplit stops the scan
# after that many delimiters; the rest of str becomes the last field.

def intsplit(str, pat, maxsplit, retain):
	matcher = compile(pat)
	pieces = []
	field_start = 0		# start of the field being collected
	scan_from = 0		# where the next search starts
	done = 0		# number of delimiters consumed so far
	while matcher.search(str, scan_from) >= 0:
		lo, hi = matcher.regs[0]
		if lo != hi:
			# A real delimiter: close the current field.
			pieces.append(str[field_start:lo])
			if retain:
				pieces.append(str[lo:hi])
			field_start = hi
			scan_from = hi
			done = done + 1
			if maxsplit and done >= maxsplit:
				break
			continue
		# Empty match: not a delimiter, move the scan on one character.
		scan_from = scan_from + 1
		if scan_from >= len(str):
			break
	pieces.append(str[field_start:])
	return pieces


# Capitalize the words of str.  The string is cut up with
# splitx(str, pat), so pat describes what separates words; the
# separators are put back unchanged when the pieces are rejoined.

def capwords(str, pat='[^a-zA-Z0-9_]+'):
	import string
	pieces = splitx(str, pat)
	# splitx() alternates field, delimiter, field, ...: the words
	# are in the even slots.
	idx = 0
	total = len(pieces)
	while idx < total:
		pieces[idx] = string.capitalize(pieces[idx])
		idx = idx + 2
	return string.joinfields(pieces, "")


# Internal subroutines:
# compile(pat): compile a pattern, caching already compiled patterns
# expand(repl, regs, str): expand \digit escapes in replacement string


# Manage a cache of compiled regular expressions.
#
# compile(pat) returns a compiled pattern.  If pat is not a string it
# is assumed to be compiled already and is returned as is.  If pat is
# a string, the cache is consulted first; on a miss the pattern is
# compiled with regex.compile() and remembered.  The cache key is the
# pair (pattern string, current regex syntax), so the same string
# compiled under a different syntax gets its own entry.
#
# clear_cache() throws away everything cached so far.

cache = {}

def compile(pat):
	if type(pat) != type(''):
		return pat		# Assume it is a compiled regex
	key = (pat, regex.get_syntax())
	try:
		return cache[key]	# Get it from the cache
	except KeyError:
		# First use under this syntax: compile and remember it.
		prog = cache[key] = regex.compile(pat)
		return prog


def clear_cache():
	# Rebind the module-level name to a fresh, empty dictionary.
	global cache
	cache = {}


# Expand \digit in the replacement.
# Returns a copy of repl in which each occurrence of \digit is
# replaced by str[a:b], where (a, b) = regs[digit].  To include a
# literal \ in the replacement, double it; other \ escapes are left
# unchanged (i.e. the \ and the following character are both copied).
# A single \ at the very end of repl is copied as is.  If repl
# contains no \ at all it is returned unchanged.

def expand(repl, regs, str):
	if '\\' not in repl:
		return repl		# nothing to expand
	new = ''
	i = 0
	n = len(repl)
	ord0 = ord('0')
	while i < n:
		c = repl[i]; i = i+1
		if c != '\\' or i >= n:
			# Ordinary character, or a backslash with nothing after it.
			new = new + c
			continue
		c = repl[i]; i = i+1	# the character being escaped
		if '0' <= c <= '9':
			# ord arithmetic turns the digit character into an index
			a, b = regs[ord(c)-ord0]
			new = new + str[a:b]
		elif c == '\\':
			new = new + c
		else:
			new = new + '\\' + c
	return new


# Test program, reads sequences "pat repl str" from stdin.
# Optional argument specifies pattern used to split lines.

def test():
	import sys
	if sys.argv[1:]:
		delpat = sys.argv[1]
	else:
		delpat = '[ \t\n]+'
	while 1:
		if sys.stdin.isatty(): sys.stderr.write('--> ')
		line = sys.stdin.readline()
		if not line: break
		if line[-1] == '\n': line = line[:-1]
		fields = split(line, delpat)
		if len(fields) <> 3:
			print 'Sorry, not three fields'
			print 'split:', `fields`
			continue
		[pat, repl, str] = split(line, delpat)
		print 'sub :', `sub(pat, repl, str)`
		print 'gsub:', `gsub(pat, repl, str)`
New module regsub contains sub(), gsub() and split() as in nawk. string.splitfields(s, '') now returns [s] as split() in nawk. Added _exit to exported functions of os. 1992-09-21 05:41:09 +08:00			`# Regular expression subroutines:`
			`# sub(pat, repl, str): replace first occurrence of pattern in string`
			`# gsub(pat, repl, str): replace all occurrences of pattern in string`
Changed split() to be compatible with changes to string.split(): the optional third argument gives a maximum number of delimiters to parse. The new function splitx() is like split() but returns a list containing the words as well as the delimiters. 1996-08-09 02:39:18 +08:00			`# split(str, pat, maxsplit): split string using pattern as delimiter`
			`# splitx(str, pat, maxsplit): split string using pattern as delimiter plus`
			`# return delimiters`
New module regsub contains sub(), gsub() and split() as in nawk. string.splitfields(s, '') now returns [s] as split() in nawk. Added _exit to exported functions of os. 1992-09-21 05:41:09 +08:00

			`import regex`


			`# Replace first occurrence of pattern pat in string str by replacement`
			`# repl. If the pattern isn't found, the string is returned unchanged.`
			`# The replacement may contain references \digit to subpatterns and`
			`# escaped backslashes. The pattern may be a string or an already`
			`# compiled pattern.`

			`def sub(pat, repl, str):`
			`prog = compile(pat)`
			`if prog.search(str) >= 0:`
			`regs = prog.regs`
			`a, b = regs[0]`
			`str = str[:a] + expand(repl, regs, str) + str[b:]`
			`return str`


			`# Replace all (non-overlapping) occurrences of pattern pat in string`
			`# str by replacement repl. The same rules as for sub() apply.`
			`# Empty matches for the pattern are replaced only when not adjacent to`
			`# a previous match, so e.g. gsub('', '-', 'abc') returns '-a-b-c-'.`

			`def gsub(pat, repl, str):`
			`prog = compile(pat)`
			`new = ''`
			`start = 0`
			`first = 1`
			`while prog.search(str, start) >= 0:`
			`regs = prog.regs`
			`a, b = regs[0]`
			`if a == b == start and not first:`
			`if start >= len(str) or prog.search(str, start+1) < 0:`
			`break`
			`regs = prog.regs`
			`a, b = regs[0]`
			`new = new + str[start:a] + expand(repl, regs, str)`
			`start = b`
			`first = 0`
			`new = new + str[start:]`
			`return new`


			`# Split string str in fields separated by delimiters matching pattern`
			`# pat. Only non-empty matches for the pattern are considered, so e.g.`
			`# split('abc', '') returns ['abc'].`
Changed split() to be compatible with changes to string.split(): the optional third argument gives a maximum number of delimiters to parse. The new function splitx() is like split() but returns a list containing the words as well as the delimiters. 1996-08-09 02:39:18 +08:00			`# The optional 3rd argument sets the number of splits that are performed.`
New module regsub contains sub(), gsub() and split() as in nawk. string.splitfields(s, '') now returns [s] as split() in nawk. Added _exit to exported functions of os. 1992-09-21 05:41:09 +08:00
Changed split() to be compatible with changes to string.split(): the optional third argument gives a maximum number of delimiters to parse. The new function splitx() is like split() but returns a list containing the words as well as the delimiters. 1996-08-09 02:39:18 +08:00			`def split(str, pat, maxsplit = 0):`
			`return intsplit(str, pat, maxsplit, 0)`

			`# Split string str in fields separated by delimiters matching pattern`
			`# pat. Only non-empty matches for the pattern are considered, so e.g.`
			`# split('abc', '') returns ['abc']. The delimiters are also included`
			`# in the list.`
			`# The optional 3rd argument sets the number of splits that are performed.`


			`def splitx(str, pat, maxsplit = 0):`
			`return intsplit(str, pat, maxsplit, 1)`

			`# Internal function used to implement split() and splitx().`

			`def intsplit(str, pat, maxsplit, retain):`
New module regsub contains sub(), gsub() and split() as in nawk. string.splitfields(s, '') now returns [s] as split() in nawk. Added _exit to exported functions of os. 1992-09-21 05:41:09 +08:00			`prog = compile(pat)`
			`res = []`
			`start = next = 0`
Changed split() to be compatible with changes to string.split(): the optional third argument gives a maximum number of delimiters to parse. The new function splitx() is like split() but returns a list containing the words as well as the delimiters. 1996-08-09 02:39:18 +08:00			`splitcount = 0`
New module regsub contains sub(), gsub() and split() as in nawk. string.splitfields(s, '') now returns [s] as split() in nawk. Added _exit to exported functions of os. 1992-09-21 05:41:09 +08:00			`while prog.search(str, next) >= 0:`
			`regs = prog.regs`
			`a, b = regs[0]`
			`if a == b:`
			`next = next + 1`
			`if next >= len(str):`
			`break`
			`else:`
			`res.append(str[start:a])`
Add third arg to split(). Add capwords() -- which uses that. 1996-06-12 02:45:15 +08:00			`if retain:`
			`res.append(str[a:b])`
New module regsub contains sub(), gsub() and split() as in nawk. string.splitfields(s, '') now returns [s] as split() in nawk. Added _exit to exported functions of os. 1992-09-21 05:41:09 +08:00			`start = next = b`
Changed split() to be compatible with changes to string.split(): the optional third argument gives a maximum number of delimiters to parse. The new function splitx() is like split() but returns a list containing the words as well as the delimiters. 1996-08-09 02:39:18 +08:00			`splitcount = splitcount + 1`
			`if (maxsplit and (splitcount >= maxsplit)):`
			`break`
New module regsub contains sub(), gsub() and split() as in nawk. string.splitfields(s, '') now returns [s] as split() in nawk. Added _exit to exported functions of os. 1992-09-21 05:41:09 +08:00			`res.append(str[start:])`
			`return res`


Add third arg to split(). Add capwords() -- which uses that. 1996-06-12 02:45:15 +08:00			`# Capitalize words split using a pattern`

Use splitx() in capwords() (bugfix after interface change for split()). Give capwords a default pattern argument which will recognize words as sequences of [a-zA-Z0-9_]. 1996-08-10 05:32:29 +08:00			`def capwords(str, pat='[^a-zA-Z0-9_]+'):`
Add third arg to split(). Add capwords() -- which uses that. 1996-06-12 02:45:15 +08:00			`import string`
Use splitx() in capwords() (bugfix after interface change for split()). Give capwords a default pattern argument which will recognize words as sequences of [a-zA-Z0-9_]. 1996-08-10 05:32:29 +08:00			`words = splitx(str, pat)`
Add third arg to split(). Add capwords() -- which uses that. 1996-06-12 02:45:15 +08:00			`for i in range(0, len(words), 2):`
			`words[i] = string.capitalize(words[i])`
			`return string.joinfields(words, "")`


New module regsub contains sub(), gsub() and split() as in nawk. string.splitfields(s, '') now returns [s] as split() in nawk. Added _exit to exported functions of os. 1992-09-21 05:41:09 +08:00			`# Internal subroutines:`
			`# compile(pat): compile a pattern, caching already compiled patterns`
			`# expand(repl, regs, str): expand \digit escapes in replacement string`


			`# Manage a cache of compiled regular expressions.`
Store the current regex syntax along with the regular expression string as the key to the cache. This means that changing the syntax will return the correct compiled pattern. clear_cache(): New function. 1997-02-19 02:52:55 +08:00			`#`
			`# If the pattern is a string a compiled version of it is returned. If`
			`# the pattern has been used before we return an already compiled`
New module regsub contains sub(), gsub() and split() as in nawk. string.splitfields(s, '') now returns [s] as split() in nawk. Added _exit to exported functions of os. 1992-09-21 05:41:09 +08:00			`# version from the cache; otherwise we compile it now and save the`
Store the current regex syntax along with the regular expression string as the key to the cache. This means that changing the syntax will return the correct compiled pattern. clear_cache(): New function. 1997-02-19 02:52:55 +08:00			`# compiled version in the cache, along with the syntax it was compiled`
			`# with. Instead of a string, a compiled regular expression can also`
			`# be passed.`
New module regsub contains sub(), gsub() and split() as in nawk. string.splitfields(s, '') now returns [s] as split() in nawk. Added _exit to exported functions of os. 1992-09-21 05:41:09 +08:00
			`cache = {}`

			`def compile(pat):`
			`if type(pat) <> type(''):`
			`return pat # Assume it is a compiled regex`
Store the current regex syntax along with the regular expression string as the key to the cache. This means that changing the syntax will return the correct compiled pattern. clear_cache(): New function. 1997-02-19 02:52:55 +08:00			`key = (pat, regex.get_syntax())`
			`if cache.has_key(key):`
			`prog = cache[key] # Get it from the cache`
New module regsub contains sub(), gsub() and split() as in nawk. string.splitfields(s, '') now returns [s] as split() in nawk. Added _exit to exported functions of os. 1992-09-21 05:41:09 +08:00			`else:`
Store the current regex syntax along with the regular expression string as the key to the cache. This means that changing the syntax will return the correct compiled pattern. clear_cache(): New function. 1997-02-19 02:52:55 +08:00			`prog = cache[key] = regex.compile(pat)`
New module regsub contains sub(), gsub() and split() as in nawk. string.splitfields(s, '') now returns [s] as split() in nawk. Added _exit to exported functions of os. 1992-09-21 05:41:09 +08:00			`return prog`


Store the current regex syntax along with the regular expression string as the key to the cache. This means that changing the syntax will return the correct compiled pattern. clear_cache(): New function. 1997-02-19 02:52:55 +08:00			`def clear_cache():`
			`global cache`
			`cache = {}`


New module regsub contains sub(), gsub() and split() as in nawk. string.splitfields(s, '') now returns [s] as split() in nawk. Added _exit to exported functions of os. 1992-09-21 05:41:09 +08:00			`# Expand \digit in the replacement.`
			`# Each occurrence of \digit is replaced by the substring of str`
			`# indicated by regs[digit]. To include a literal \ in the`
			`# replacement, double it; other \ escapes are left unchanged (i.e.`
			`# the \ and the following character are both copied).`

			`def expand(repl, regs, str):`
			`if '\\' not in repl:`
			`return repl`
			`new = ''`
			`i = 0`
avoid eval() like the plague 1996-05-29 07:01:28 +08:00			`ord0 = ord('0')`
New module regsub contains sub(), gsub() and split() as in nawk. string.splitfields(s, '') now returns [s] as split() in nawk. Added _exit to exported functions of os. 1992-09-21 05:41:09 +08:00			`while i < len(repl):`
			`c = repl[i]; i = i+1`
			`if c <> '\\' or i >= len(repl):`
			`new = new + c`
			`else:`
			`c = repl[i]; i = i+1`
			`if '0' <= c <= '9':`
avoid eval() like the plague 1996-05-29 07:01:28 +08:00			`a, b = regs[ord(c)-ord0]`
New module regsub contains sub(), gsub() and split() as in nawk. string.splitfields(s, '') now returns [s] as split() in nawk. Added _exit to exported functions of os. 1992-09-21 05:41:09 +08:00			`new = new + str[a:b]`
			`elif c == '\\':`
			`new = new + c`
			`else:`
			`new = new + '\\' + c`
			`return new`


			`# Test program, reads sequences "pat repl str" from stdin.`
			`# Optional argument specifies pattern used to split lines.`

			`def test():`
			`import sys`
			`if sys.argv[1:]:`
			`delpat = sys.argv[1]`
			`else:`
			`delpat = '[ \t\n]+'`
			`while 1:`
			`if sys.stdin.isatty(): sys.stderr.write('--> ')`
			`line = sys.stdin.readline()`
			`if not line: break`
			`if line[-1] == '\n': line = line[:-1]`
			`fields = split(line, delpat)`
			`if len(fields) <> 3:`
			`print 'Sorry, not three fields'`
			print 'split:', `fields`
			`continue`
			`[pat, repl, str] = split(line, delpat)`
			print 'sub :', `sub(pat, repl, str)`
			print 'gsub:', `gsub(pat, repl, str)`