"""RFC 3986 compliant, scheme-agnostic replacement for `urllib.parse`. This module defines RFC 3986 compliant replacements for the most commonly used functions of the Python Standard Library :mod:`urllib.parse` module. """ import collections import collections.abc import ipaddress import numbers import re from string import hexdigits __all__ = ( "GEN_DELIMS", "RESERVED", "SUB_DELIMS", "UNRESERVED", "isabspath", "isabsuri", "isnetpath", "isrelpath", "issamedoc", "isuri", "uricompose", "uridecode", "uridefrag", "uriencode", "urijoin", "urisplit", "uriunsplit", ) __version__ = "4.0.2" # RFC 3986 2.2. Reserved Characters # # reserved = gen-delims / sub-delims # # gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" # # sub-delims = "!" / "$" / "&" / "'" / "(" / ")" # / "*" / "+" / "," / ";" / "=" # GEN_DELIMS = ":/?#[]@" SUB_DELIMS = "!$&'()*+,;=" RESERVED = GEN_DELIMS + SUB_DELIMS # RFC 3986 2.3. Unreserved Characters # # unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" # UNRESERVED = ( "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" "0123456789" "-._~" ) _unreserved = frozenset(UNRESERVED.encode()) # RFC 3986 2.1: For consistency, URI producers and normalizers should # use uppercase hexadecimal digits for all percent-encodings. _encoded = { b"": [ bytes([i]) if i in _unreserved else ("%%%02X" % i).encode() for i in range(256) ] } _decoded = { (a + b).encode(): bytes.fromhex(a + b) for a in hexdigits for b in hexdigits } def uriencode(uristring, safe="", encoding="utf-8", errors="strict"): """Encode a URI string or string component.""" if not isinstance(uristring, bytes): uristring = uristring.encode(encoding, errors) if not isinstance(safe, bytes): safe = safe.encode("ascii") try: encoded = _encoded[safe] except KeyError: encoded = _encoded[b""][:] for i in safe: encoded[i] = bytes([i]) _encoded[safe] = encoded return b"".join(map(encoded.__getitem__, uristring)) def uridecode(uristring, encoding="utf-8", errors="strict"): """Decode a URI string or string component.""" if not isinstance(uristring, bytes): uristring = uristring.encode(encoding or "ascii", errors) parts = uristring.split(b"%") result = [parts[0]] append = result.append decode = _decoded.get for s in parts[1:]: append(decode(s[:2], b"%" + s[:2])) append(s[2:]) if encoding is not None: return b"".join(result).decode(encoding, errors) else: return b"".join(result) class DefragResult(collections.namedtuple("DefragResult", "uri fragment")): """Class to hold :func:`uridefrag` results.""" __slots__ = () # prevent creation of instance dictionary def geturi(self): """Return the recombined version of the original URI as a string.""" fragment = self.fragment if fragment is None: return self.uri elif isinstance(fragment, bytes): return self.uri + b"#" + fragment else: return self.uri + "#" + fragment def getfragment(self, default=None, encoding="utf-8", errors="strict"): """Return the decoded fragment identifier, or `default` if the original URI did not contain a fragment component. """ fragment = self.fragment if fragment is not None: return uridecode(fragment, encoding, errors) else: return default class SplitResult( collections.namedtuple("SplitResult", "scheme authority path query fragment") ): """Base class to hold :func:`urisplit` results.""" __slots__ = () # prevent creation of instance dictionary @property def userinfo(self): authority = self.authority if authority is None: return None userinfo, present, _ = authority.rpartition(self.AT) if present: return userinfo else: return None @property def host(self): authority = self.authority if authority is None: return None _, _, hostinfo = authority.rpartition(self.AT) host, _, port = hostinfo.rpartition(self.COLON) if port.lstrip(self.DIGITS): return hostinfo else: return host @property def port(self): authority = self.authority if authority is None: return None _, present, port = authority.rpartition(self.COLON) if present and not port.lstrip(self.DIGITS): return port else: return None def geturi(self): """Return the re-combined version of the original URI reference as a string. """ scheme, authority, path, query, fragment = self # RFC 3986 5.3. Component Recomposition result = [] if scheme is not None: result.extend([scheme, self.COLON]) if authority is not None: result.extend([self.SLASH, self.SLASH, authority]) result.append(path) if query is not None: result.extend([self.QUEST, query]) if fragment is not None: result.extend([self.HASH, fragment]) return self.EMPTY.join(result) def getscheme(self, default=None): """Return the URI scheme in canonical (lowercase) form, or `default` if the original URI reference did not contain a scheme component. """ scheme = self.scheme if scheme is None: return default elif isinstance(scheme, bytes): return scheme.decode("ascii").lower() else: return scheme.lower() def getauthority(self, default=None, encoding="utf-8", errors="strict"): """Return the decoded userinfo, host and port subcomponents of the URI authority as a three-item tuple. """ # TBD: (userinfo, host, port) kwargs, default string? if default is None: default = (None, None, None) elif not isinstance(default, collections.abc.Iterable): raise TypeError("Invalid default type") elif len(default) != 3: raise ValueError("Invalid default length") # TODO: this could be much more efficient by using a dedicated regex return ( self.getuserinfo(default[0], encoding, errors), self.gethost(default[1], errors), self.getport(default[2]), ) def getuserinfo(self, default=None, encoding="utf-8", errors="strict"): """Return the decoded userinfo subcomponent of the URI authority, or `default` if the original URI reference did not contain a userinfo field. """ userinfo = self.userinfo if userinfo is None: return default else: return uridecode(userinfo, encoding, errors) def gethost(self, default=None, errors="strict"): """Return the decoded host subcomponent of the URI authority as a string or an :mod:`ipaddress` address object, or `default` if the original URI reference did not contain a host. """ host = self.host if host is None or (not host and default is not None): return default elif host.startswith(self.LBRACKET) and host.endswith(self.RBRACKET): return self.__parse_ip_literal(host[1:-1]) elif host.startswith(self.LBRACKET) or host.endswith(self.RBRACKET): raise ValueError("Invalid host %r" % host) # TODO: faster check for IPv4 address? try: if isinstance(host, bytes): return ipaddress.IPv4Address(host.decode("ascii")) else: return ipaddress.IPv4Address(host) except ValueError: return uridecode(host, "utf-8", errors).lower() def getport(self, default=None): """Return the port subcomponent of the URI authority as an :class:`int`, or `default` if the original URI reference did not contain a port or if the port was empty. """ port = self.port if port: return int(port) else: return default def getpath(self, encoding="utf-8", errors="strict"): """Return the normalized decoded URI path.""" path = self.__remove_dot_segments(self.path) return uridecode(path, encoding, errors) def getquery(self, default=None, encoding="utf-8", errors="strict"): """Return the decoded query string, or `default` if the original URI reference did not contain a query component. """ query = self.query if query is None: return default else: return uridecode(query, encoding, errors) def getquerydict(self, sep="&", encoding="utf-8", errors="strict"): """Split the query component into individual `name=value` pairs separated by `sep` and return a dictionary of query variables. The dictionary keys are the unique query variable names and the values are lists of values for each name. """ dict = collections.defaultdict(list) for name, value in self.getquerylist(sep, encoding, errors): dict[name].append(value) return dict def getquerylist(self, sep="&", encoding="utf-8", errors="strict"): """Split the query component into individual `name=value` pairs separated by `sep`, and return a list of `(name, value)` tuples. """ if not self.query: return [] elif isinstance(sep, type(self.query)): qsl = self.query.split(sep) elif isinstance(sep, bytes): qsl = self.query.split(sep.decode("ascii")) else: qsl = self.query.split(sep.encode("ascii")) items = [] for parts in [qs.partition(self.EQ) for qs in qsl if qs]: name = uridecode(parts[0], encoding, errors) if parts[1]: value = uridecode(parts[2], encoding, errors) else: value = None items.append((name, value)) return items def getfragment(self, default=None, encoding="utf-8", errors="strict"): """Return the decoded fragment identifier, or `default` if the original URI reference did not contain a fragment component. """ fragment = self.fragment if fragment is None: return default else: return uridecode(fragment, encoding, errors) def isuri(self): """Return :const:`True` if this is a URI.""" return self.scheme is not None def isabsuri(self): """Return :const:`True` if this is an absolute URI.""" return self.scheme is not None and self.fragment is None def isnetpath(self): """Return :const:`True` if this is a network-path reference.""" return self.scheme is None and self.authority is not None def isabspath(self): """Return :const:`True` if this is an absolute-path reference.""" return ( self.scheme is None and self.authority is None and self.path.startswith(self.SLASH) ) def isrelpath(self): """Return :const:`True` if this is a relative-path reference.""" return ( self.scheme is None and self.authority is None and not self.path.startswith(self.SLASH) ) def issamedoc(self): """Return :const:`True` if this is a same-document reference.""" return ( self.scheme is None and self.authority is None and not self.path and self.query is None ) def transform(self, ref, strict=False): """Transform a URI reference relative to `self` into a :class:`SplitResult` representing its target URI. """ scheme, authority, path, query, fragment = self.RE.match(ref).groups() # RFC 3986 5.2.2. Transform References if scheme is not None and (strict or scheme != self.scheme): path = self.__remove_dot_segments(path) elif authority is not None: scheme = self.scheme path = self.__remove_dot_segments(path) elif not path: scheme = self.scheme authority = self.authority path = self.path query = self.query if query is None else query elif path.startswith(self.SLASH): scheme = self.scheme authority = self.authority path = self.__remove_dot_segments(path) else: scheme = self.scheme authority = self.authority path = self.__remove_dot_segments(self.__merge(path)) return type(self)(scheme, authority, path, query, fragment) def __merge(self, path): # RFC 3986 5.2.3. Merge Paths if self.authority is not None and not self.path: return self.SLASH + path else: parts = self.path.rpartition(self.SLASH) return parts[1].join((parts[0], path)) @classmethod def __remove_dot_segments(cls, path): # RFC 3986 5.2.4. Remove Dot Segments pseg = [] for s in path.split(cls.SLASH): if s == cls.DOT: continue elif s != cls.DOTDOT: pseg.append(s) elif len(pseg) == 1 and not pseg[0]: continue elif pseg and pseg[-1] != cls.DOTDOT: pseg.pop() else: pseg.append(s) # adjust for trailing '/.' or '/..' if path.rpartition(cls.SLASH)[2] in (cls.DOT, cls.DOTDOT): pseg.append(cls.EMPTY) if path and len(pseg) == 1 and pseg[0] == cls.EMPTY: pseg.insert(0, cls.DOT) return cls.SLASH.join(pseg) @classmethod def __parse_ip_literal(cls, address): # RFC 3986 3.2.2: In anticipation of future, as-yet-undefined # IP literal address formats, an implementation may use an # optional version flag to indicate such a format explicitly # rather than rely on heuristic determination. # # IP-literal = "[" ( IPv6address / IPvFuture ) "]" # # IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) # # If a URI containing an IP-literal that starts with "v" # (case-insensitive), indicating that the version flag is # present, is dereferenced by an application that does not # know the meaning of that version flag, then the application # should return an appropriate error for "address mechanism # not supported". if isinstance(address, bytes): address = address.decode("ascii") if address.startswith("v"): raise ValueError("address mechanism not supported") return ipaddress.IPv6Address(address) class SplitResultBytes(SplitResult): __slots__ = () # prevent creation of instance dictionary # RFC 3986 Appendix B RE = re.compile( rb""" (?:([A-Za-z][A-Za-z0-9+.-]*):)? # scheme (RFC 3986 3.1) (?://([^/?#]*))? # authority ([^?#]*) # path (?:\?([^#]*))? # query (?:\#(.*))? # fragment """, flags=re.VERBOSE, ) # RFC 3986 2.2 gen-delims COLON, SLASH, QUEST, HASH, LBRACKET, RBRACKET, AT = ( b":", b"/", b"?", b"#", b"[", b"]", b"@", ) # RFC 3986 3.3 dot-segments DOT, DOTDOT = b".", b".." EMPTY, EQ = b"", b"=" DIGITS = b"0123456789" class SplitResultString(SplitResult): __slots__ = () # prevent creation of instance dictionary # RFC 3986 Appendix B RE = re.compile( r""" (?:([A-Za-z][A-Za-z0-9+.-]*):)? # scheme (RFC 3986 3.1) (?://([^/?#]*))? # authority ([^?#]*) # path (?:\?([^#]*))? # query (?:\#(.*))? # fragment """, flags=re.VERBOSE, ) # RFC 3986 2.2 gen-delims COLON, SLASH, QUEST, HASH, LBRACKET, RBRACKET, AT = ( ":", "/", "?", "#", "[", "]", "@", ) # RFC 3986 3.3 dot-segments DOT, DOTDOT = ".", ".." EMPTY, EQ = "", "=" DIGITS = "0123456789" def uridefrag(uristring): """Remove an existing fragment component from a URI reference string.""" if isinstance(uristring, bytes): parts = uristring.partition(b"#") else: parts = uristring.partition("#") return DefragResult(parts[0], parts[2] if parts[1] else None) def urisplit(uristring): """Split a well-formed URI reference string into a tuple with five components corresponding to a URI's general structure:: :///?# """ if isinstance(uristring, bytes): result = SplitResultBytes else: result = SplitResultString return result(*result.RE.match(uristring).groups()) def uriunsplit(parts): """Combine the elements of a five-item iterable into a URI reference's string representation. """ scheme, authority, path, query, fragment = parts if isinstance(path, bytes): result = SplitResultBytes else: result = SplitResultString return result(scheme, authority, path, query, fragment).geturi() def urijoin(base, ref, strict=False): """Convert a URI reference relative to a base URI to its target URI string. """ if isinstance(base, type(ref)): return urisplit(base).transform(ref, strict).geturi() elif isinstance(base, bytes): return urisplit(base.decode()).transform(ref, strict).geturi() else: return urisplit(base).transform(ref.decode(), strict).geturi() def isuri(uristring): """Return :const:`True` if `uristring` is a URI.""" return urisplit(uristring).isuri() def isabsuri(uristring): """Return :const:`True` if `uristring` is an absolute URI.""" return urisplit(uristring).isabsuri() def isnetpath(uristring): """Return :const:`True` if `uristring` is a network-path reference.""" return urisplit(uristring).isnetpath() def isabspath(uristring): """Return :const:`True` if `uristring` is an absolute-path reference.""" return urisplit(uristring).isabspath() def isrelpath(uristring): """Return :const:`True` if `uristring` is a relative-path reference.""" return urisplit(uristring).isrelpath() def issamedoc(uristring): """Return :const:`True` if `uristring` is a same-document reference.""" return urisplit(uristring).issamedoc() # TBD: move compose to its own submodule? # RFC 3986 3.1: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) _SCHEME_RE = re.compile(b"^[A-Za-z][A-Za-z0-9+.-]*$") # RFC 3986 3.2: authority = [ userinfo "@" ] host [ ":" port ] _AUTHORITY_RE_BYTES = re.compile(b"^(?:(.*)@)?(.*?)(?::([0-9]*))?$") _AUTHORITY_RE_STR = re.compile("^(?:(.*)@)?(.*?)(?::([0-9]*))?$") # safe component characters _SAFE_USERINFO = SUB_DELIMS + ":" _SAFE_HOST = SUB_DELIMS _SAFE_PATH = SUB_DELIMS + ":@/" _SAFE_QUERY = SUB_DELIMS + ":@/?" _SAFE_FRAGMENT = SUB_DELIMS + ":@/?" def _scheme(scheme): if _SCHEME_RE.match(scheme): return scheme.lower() else: raise ValueError("Invalid scheme component") def _authority(userinfo, host, port, encoding): authority = [] if userinfo is not None: authority.append(uriencode(userinfo, _SAFE_USERINFO, encoding)) authority.append(b"@") if isinstance(host, ipaddress.IPv6Address): authority.append(b"[" + host.compressed.encode() + b"]") elif isinstance(host, ipaddress.IPv4Address): authority.append(host.compressed.encode()) elif isinstance(host, bytes): authority.append(_host(host)) elif host is not None: authority.append(_host(host.encode("utf-8"))) if isinstance(port, numbers.Number): authority.append(_port(str(port).encode())) elif isinstance(port, bytes): authority.append(_port(port)) elif port is not None: authority.append(_port(port.encode())) return b"".join(authority) if authority else None def _ip_literal(address): if address.startswith("v"): raise ValueError("Address mechanism not supported") else: return b"[" + ipaddress.IPv6Address(address).compressed.encode() + b"]" def _host(host): # RFC 3986 3.2.3: Although host is case-insensitive, producers and # normalizers should use lowercase for registered names and # hexadecimal addresses for the sake of uniformity, while only # using uppercase letters for percent-encodings. if host.startswith(b"[") and host.endswith(b"]"): return _ip_literal(host[1:-1].decode()) # check for IPv6 addresses as returned by SplitResult.gethost() try: return _ip_literal(host.decode("utf-8")) except ValueError: return uriencode(host.lower(), _SAFE_HOST, "utf-8") def _port(port): # RFC 3986 3.2.3: URI producers and normalizers should omit the # port component and its ":" delimiter if port is empty or if its # value would be the same as that of the scheme's default. if port.lstrip(b"0123456789"): raise ValueError("Invalid port subcomponent") elif port: return b":" + port else: return b"" def _querylist(items, sep, encoding): terms = [] append = terms.append safe = _SAFE_QUERY.replace(sep, "") for key, value in items: name = uriencode(key, safe, encoding) if value is None: append(name) elif isinstance(value, (bytes, str)): append(name + b"=" + uriencode(value, safe, encoding)) else: append(name + b"=" + uriencode(str(value), safe, encoding)) return sep.encode("ascii").join(terms) def _querydict(mapping, sep, encoding): items = [] for key, value in mapping.items(): if isinstance(value, (bytes, str)): items.append((key, value)) elif isinstance(value, collections.abc.Iterable): items.extend([(key, v) for v in value]) else: items.append((key, value)) return _querylist(items, sep, encoding) def uricompose( scheme=None, authority=None, path="", query=None, fragment=None, userinfo=None, host=None, port=None, querysep="&", encoding="utf-8", ): """Compose a URI reference string from its individual components.""" # RFC 3986 3.1: Scheme names consist of a sequence of characters # beginning with a letter and followed by any combination of # letters, digits, plus ("+"), period ("."), or hyphen ("-"). # Although schemes are case-insensitive, the canonical form is # lowercase and documents that specify schemes must do so with # lowercase letters. An implementation should accept uppercase # letters as equivalent to lowercase in scheme names (e.g., allow # "HTTP" as well as "http") for the sake of robustness but should # only produce lowercase scheme names for consistency. if isinstance(scheme, bytes): scheme = _scheme(scheme) elif scheme is not None: scheme = _scheme(scheme.encode()) # authority must be string type or three-item iterable if authority is None: authority = (None, None, None) elif isinstance(authority, bytes): authority = _AUTHORITY_RE_BYTES.match(authority).groups() elif isinstance(authority, str): authority = _AUTHORITY_RE_STR.match(authority).groups() elif not isinstance(authority, collections.abc.Iterable): raise TypeError("Invalid authority type") elif len(authority) != 3: raise ValueError("Invalid authority length") authority = _authority( userinfo if userinfo is not None else authority[0], host if host is not None else authority[1], port if port is not None else authority[2], encoding, ) # RFC 3986 3.3: If a URI contains an authority component, then the # path component must either be empty or begin with a slash ("/") # character. If a URI does not contain an authority component, # then the path cannot begin with two slash characters ("//"). path = uriencode(path, _SAFE_PATH, encoding) if authority is not None and path and not path.startswith(b"/"): raise ValueError("Invalid path with authority component") if authority is None and path.startswith(b"//"): raise ValueError("Invalid path without authority component") # RFC 3986 4.2: A path segment that contains a colon character # (e.g., "this:that") cannot be used as the first segment of a # relative-path reference, as it would be mistaken for a scheme # name. Such a segment must be preceded by a dot-segment (e.g., # "./this:that") to make a relative-path reference. if scheme is None and authority is None and not path.startswith(b"/"): if b":" in path.partition(b"/")[0]: path = b"./" + path # RFC 3986 3.4: The characters slash ("/") and question mark ("?") # may represent data within the query component. Beware that some # older, erroneous implementations may not handle such data # correctly when it is used as the base URI for relative # references (Section 5.1), apparently because they fail to # distinguish query data from path data when looking for # hierarchical separators. However, as query components are often # used to carry identifying information in the form of "key=value" # pairs and one frequently used value is a reference to another # URI, it is sometimes better for usability to avoid percent- # encoding those characters. if isinstance(query, (bytes, str)): query = uriencode(query, _SAFE_QUERY, encoding) elif isinstance(query, collections.abc.Mapping): query = _querydict(query, querysep, encoding) elif isinstance(query, collections.abc.Iterable): query = _querylist(query, querysep, encoding) elif query is not None: raise TypeError("Invalid query type") # RFC 3986 3.5: The characters slash ("/") and question mark ("?") # are allowed to represent data within the fragment identifier. # Beware that some older, erroneous implementations may not handle # this data correctly when it is used as the base URI for relative # references. if fragment is not None: fragment = uriencode(fragment, _SAFE_FRAGMENT, encoding) # return URI reference as `str` return uriunsplit((scheme, authority, path, query, fragment)).decode()