# Mirror of https://github.com/python/cpython.git
# Synced 2024-11-28 04:15:11 +08:00
# ef30dc872b: The nanoseconds saved by using dict.fromkeys aren't worth
# the loss in clarity. Linear searches live on.
# 273 lines, 9.3 KiB, Python

"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""

# Public API: the names exported by a star-import of this module.
__all__ = [
    "urlparse",
    "urlunparse",
    "urljoin",
    "urldefrag",
    "urlsplit",
    "urlunsplit",
]

# A classification of schemes.  The empty string '' stands for "no scheme
# given" and makes the corresponding behaviour apply by default.  These
# are plain lists searched linearly.

# Schemes for which urljoin() resolves a relative reference.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap', 'wais', 'file',
    'https', 'shttp', 'mms', 'prospero', 'rtsp', 'rtspu', '',
]

# Schemes whose URLs may carry a '//netloc' part.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet', 'imap', 'wais', 'file',
    'mms', 'https', 'shttp', 'snews', 'prospero', 'rtsp', 'rtspu', '',
]

# Schemes without a hierarchical path (kept as part of the module's data;
# nothing in the parsing functions of this file consults it).
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet',
    'wais', 'imap', 'snews', 'sip',
]

# Schemes for which urlparse() splits ';params' off the path.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap', 'https',
    'shttp', 'rtsp', 'rtspu', 'sip', 'mms', '',
]

# Schemes for which urlsplit() splits off a '?query'.
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms',
    'gopher', 'rtsp', 'rtspu', 'sip', '',
]

# Schemes for which urlsplit() splits off a '#fragment'.
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
    'https', 'shttp', 'snews', 'file', 'prospero', '',
]

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz' +
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ' +
                '0123456789' +
                '+-.')

# urlsplit() memoises its results in _parse_cache and empties the cache
# once it holds MAX_CACHE_SIZE entries.
MAX_CACHE_SIZE = 20
_parse_cache = {}


def clear_cache():
    """Throw away every memoised parse result.

    The module-level name is rebound to a brand-new dictionary rather
    than emptying the existing one in place.
    """
    global _parse_cache
    _parse_cache = {}


def urlparse(url, scheme='', allow_fragments=1):
    """Break a URL into its six components.

    The general shape is
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    and the result is the 6-tuple
    (scheme, netloc, path, params, query, fragment).

    The components are not subdivided any further (netloc, for instance,
    stays a single string) and % escapes are left unexpanded.
    """
    scheme, netloc, path, query, fragment = urlsplit(url, scheme,
                                                     allow_fragments)
    params = ''
    # Parameters are only recognised for schemes that are known to use them.
    if ';' in path and scheme in uses_params:
        path, params = _splitparams(path)
    return scheme, netloc, path, params, query, fragment

def _splitparams(url):
|
|
if '/' in url:
|
|
i = url.find(';', url.rfind('/'))
|
|
if i < 0:
|
|
return url, ''
|
|
else:
|
|
i = url.find(';')
|
|
return url[:i], url[i+1:]
|
|
|
|
def _splitnetloc(url, start=0):
    """Split url[start:] into (netloc, rest).

    The netloc ends at the earliest '/', '?' or '#' found at or after
    *start*, or at the end of the string when none of them occurs.
    """
    end = len(url)
    for delim in '/?#':
        pos = url.find(delim, start)
        if 0 <= pos < end:
            end = pos
    return url[start:end], url[end:]


def urlsplit(url, scheme='', allow_fragments=1):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).

    *scheme* is the default used when the URL itself does not start with
    one.  When *allow_fragments* is false no fragment is split off.  The
    netloc extends from the leading '//' up to the first '/', '?' or '#'.

    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    Results are memoised in the module-level parse cache, which is
    emptied once it holds MAX_CACHE_SIZE entries.
    """
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key)
    if cached is not None:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http':  # optimize the common case
            # http uses netloc, query and fragment, so the table lookups
            # done on the generic path below can be skipped.
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            result = scheme, netloc, url, query, fragment
            _parse_cache[key] = result
            return result
        # Text before the first ':' is a scheme only if every character
        # is a legal scheme character; otherwise the default is kept.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    if scheme in uses_netloc and url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    result = scheme, netloc, url, query, fragment
    _parse_cache[key] = result
    return result

def urlunparse(components):
    """Put a parsed URL back together again.

    *components* is a 6-item sequence
    (scheme, netloc, url, params, query, fragment), such as the tuple
    returned by urlparse().  The result is a single URL string.

    This may result in a slightly different, but equivalent URL, if the
    URL that was parsed originally had redundant delimiters, e.g. a ?
    with an empty query (the draft states that these are equivalent).
    """
    # Unpack explicitly instead of using a tuple parameter in the
    # signature, so the definition is valid on Python 2 and Python 3.
    scheme, netloc, url, params, query, fragment = components
    if params:
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))

def urlunsplit(components):
    """Combine the pieces of a split URL into a single string.

    *components* is a 5-item sequence
    (scheme, netloc, url, query, fragment), such as the tuple returned
    by urlsplit().  Empty query and fragment parts are omitted together
    with their '?' / '#' delimiters.
    """
    # Unpack explicitly instead of using a tuple parameter in the
    # signature, so the definition is valid on Python 2 and Python 3.
    scheme, netloc, url, query, fragment = components
    # Emit the '//' marker when there is a netloc, or when the scheme is a
    # netloc-using one and the path does not already begin with '//'.
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/':
            # A non-empty path following a netloc must be rooted.
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url

def urljoin(base, url, allow_fragments = 1):
    """Resolve a possibly relative URL against a base URL.

    Returns *url* unchanged when *base* is empty, when *url* names a
    different scheme than *base*, or when the scheme is not listed in
    uses_relative.  Returns *base* when *url* is empty.  Otherwise the
    missing leading components of *url* are taken from *base* and the
    path is resolved relative to the base path.
    """
    if not base:
        return url
    if not url:
        return base
    (bscheme, bnetloc, bpath,
     bparams, bquery, bfragment) = urlparse(base, '', allow_fragments)
    (scheme, netloc, path,
     params, query, fragment) = urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference carries its own netloc: nothing is inherited.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only scheme and netloc come from the base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: reuse the base path; params are inherited when
        # absent, and the query only when params were inherited too.
        if not params:
            params = bparams
            if not query:
                query = bquery
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    # Replace the last segment of the base path by the relative path.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    # Repeatedly drop the first "<name>/.." pair, never touching the
    # first or the last segment, until no such pair is left.
    collapsed = 1
    while collapsed:
        collapsed = 0
        for pos in range(1, len(segments) - 1):
            if (segments[pos] == '..'
                    and segments[pos - 1] not in ('', '..')):
                del segments[pos - 1:pos + 1]
                collapsed = 1
                break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        # A trailing "<name>/.." collapses to an empty last segment.
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))

def urldefrag(url):
    """Strip the fragment, if any, from a URL.

    Returns a 2-tuple (url_without_fragment, fragment).  A URL that
    contains no '#' is returned as given, paired with an empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment


# Built-in data for test().  The first non-blank line is the base URL;
# every following line has the form "<relative> = <expected>", where
# <expected> is the joined URL wrapped as <URL:...>.
test_input = """
http://a/b/c/d

g:h = <URL:g:h>
http:g = <URL:http://a/b/c/g>
http: = <URL:http://a/b/c/d>
g = <URL:http://a/b/c/g>
./g = <URL:http://a/b/c/g>
g/ = <URL:http://a/b/c/g/>
/g = <URL:http://a/g>
//g = <URL:http://g>
?y = <URL:http://a/b/c/d?y>
g?y = <URL:http://a/b/c/g?y>
g?y/./x = <URL:http://a/b/c/g?y/./x>
. = <URL:http://a/b/c/>
./ = <URL:http://a/b/c/>
.. = <URL:http://a/b/>
../ = <URL:http://a/b/>
../g = <URL:http://a/b/g>
../.. = <URL:http://a/>
../../g = <URL:http://a/g>
../../../g = <URL:http://a/../g>
./../g = <URL:http://a/b/g>
./g/. = <URL:http://a/b/c/g/>
/./g = <URL:http://a/./g>
g/./h = <URL:http://a/b/c/g/h>
g/../h = <URL:http://a/b/c/h>
http:g = <URL:http://a/b/c/g>
http: = <URL:http://a/b/c/d>
http:?y = <URL:http://a/b/c/d?y>
http:g?y = <URL:http://a/b/c/g?y>
http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
# XXX The result for //g is actually http://g/; is this a problem?

def test():
    """Run the URL self-test and print the results.

    Input lines come from the file named by the first command-line
    argument ('-' means standard input), or from the built-in test_input
    when no argument is given.  The first non-blank line provides the
    base URL.  For every non-blank line the first word is parsed with
    urlparse() and joined to the base with urljoin(), and both results
    are printed.  A line of the form "<url> = <expected>" additionally
    triggers an EXPECTED message when the joined result differs.
    """
    import sys
    base = ''
    opened = None  # file object this function opened itself, if any
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = opened = open(fn)
        # Pull one line at a time until readline() signals end of file.
        lines = iter(fp.readline, '')
    else:
        lines = test_input.splitlines()
    try:
        for line in lines:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % words[2])
    finally:
        # Close only what was opened here; sys.stdin is left alone.
        if opened is not None:
            opened.close()


if __name__ == '__main__':
    test()