PK!ㄮurl_normalize/__init__.py# -*- coding: utf-8 -*- """ URI normalizator. URI Normalization function: * Take care of IDN domains. * Always provide the URI scheme in lowercase characters. * Always provide the host, if any, in lowercase characters. * Only perform percent-encoding where it is essential. * Always use uppercase A-through-F characters when percent-encoding. * Prevent dot-segments appearing in non-relative URI paths. * For schemes that define a default authority, use an empty authority if the default is desired. * For schemes that define an empty path to be equivalent to a path of "/", use "/". * For schemes that define a port, use an empty port if the default is desired * All portions of the URI must be utf-8 encoded NFC from Unicode strings Inspired by Sam Ruby's urlnorm.py: http://intertwingly.net/blog/2004/08/04/Urlnorm This fork author: Nikolay Panov () History: * 1.4.0: A bit of code refactoring and cleanup * 1.3.2: Support empty string and double slash urls (//domain.tld) * 1.3.1: Same code support both Python 3 and Python 2. * 1.3: Python 3 compatibility * 1.2: PEP8, setup.py * 1.1.2: support for shebang (#!) urls * 1.1.1: using 'http' schema by default when appropriate * 1.1: added handling of IDN domains * 1.0: code pep8-zation * 0.1: forked from Sam Ruby's urlnorm.py """ from __future__ import absolute_import from .url_normalize import url_normalize __license__ = "Python" __version__ = "1.4.0" __all__ = ["url_normalize"] PK!yt!  url_normalize/tools.py"""Url normalize tools (py27/py37 compatible).""" import re import unicodedata from collections import namedtuple import six from six.moves.urllib.parse import quote as quote_orig from six.moves.urllib.parse import unquote as unquote_orig from six.moves.urllib.parse import urlsplit, urlunsplit URL = namedtuple( "URL", ["scheme", "userinfo", "host", "port", "path", "query", "fragment"] ) def deconstruct_url(url): """Tranform the url into URL structure. Params: url : string : the URL Returns: URL """ scheme, auth, path, query, fragment = urlsplit(url.strip()) (userinfo, host, port) = re.search("([^@]*@)?([^:]*):?(.*)", auth).groups() return URL( fragment=fragment, host=host, path=path, port=port, query=query, scheme=scheme, userinfo=userinfo or "", ) def reconstruct_url(url): """Reconstruct string url from URL. Params: url : URL object instance Returns: string : reconstructed url string """ auth = (url.userinfo or "") + url.host if url.port: auth += ":" + url.port return urlunsplit((url.scheme, auth, url.path, url.query, url.fragment)) def force_unicode(string, charset="utf-8"): """Convert string to unicode if it is not yet unicode. Params: string : string/unicode : an input string charset : string : optional : output encoding Returns: unicode """ if isinstance(string, six.text_type): # Always True on Py3 return string return string.decode(charset, "replace") # Py2 only def unquote(string, charset="utf-8"): """Unquote and normalize unicode string. Params: string : string to be unquoted charset : string : optional : output encoding Returns: string : an unquoted and normalized string """ string = unquote_orig(string) string = force_unicode(string, charset) string = unicodedata.normalize("NFC", string).encode(charset) return string def quote(string, safe="/"): """Quote string. Params: string : string to be quoted safe : string of safe characters Returns: string : quoted string """ string = quote_orig(string, safe) return string PK!.url_normalize/url_normalize.py# -*- coding: utf-8 -*- """URL normalize main module.""" import re from .tools import deconstruct_url, force_unicode, quote, reconstruct_url, unquote DEFAULT_PORT = { "ftp": "21", "gopher": "70", "http": "80", "https": "443", "news": "119", "nntp": "119", "snews": "563", "snntp": "563", "telnet": "23", "ws": "80", "wss": "443", } DEFAULT_SCHEME = "https" def provide_url_scheme(url): """Make sure we have valid url scheme. Params: url : string : the URL Returns: string : updated url with validated/attached scheme """ has_scheme = ":" in url[:7] is_default_scheme = url.startswith("//") is_file_path = url == "-" or (url.startswith("/") and not is_default_scheme) if not url or has_scheme or is_file_path: return url if is_default_scheme: return DEFAULT_SCHEME + ":" + url return DEFAULT_SCHEME + "://" + url def generic_url_cleanup(url): """Cleanup the URL from unnecessary data and convert to final form. Converts shebang urls to final form, removed unnecessary data from the url. Params: url : string : the URL Returns: string : update url """ url = url.replace("#!", "?_escaped_fragment_=") url = re.sub(r"utm_source=[^&]+&?", "", url) url = url.rstrip("&? ") return url def normalize_scheme(scheme): """Normalize scheme part of the url. Params: scheme : string : url scheme, e.g., 'https' Returns: string : normalized scheme data. """ return scheme.lower() def normalize_userinfo(userinfo): """Normalize userinfo part of the url. Params: userinfo : string : url userinfo, e.g., 'user@' Returns: string : normalized userinfo data. """ if userinfo in ["@", ":@"]: return "" return userinfo def normalize_host(host, charset="utf-8"): """Normalize host part of the url. Lowercase and strip of final dot. Also, take care about IDN domains. Params: host : string : url host, e.g., 'site.com' Returns: string : normalized host data. """ host = force_unicode(host, charset) host = host.lower() host = host.strip(".") host = host.encode("idna").decode(charset) return host def normalize_port(port, scheme): """Normalize port part of the url. Remove mention of default port number Params: port : string : url port, e.g., '8080' scheme : string : url scheme, e.g., 'http' Returns: string : normalized port data. """ if not port.isdigit(): return port port = str(int(port)) if DEFAULT_PORT[scheme] == port: return "" return port def normalize_path(path, scheme): """Normalize path part of the url. Remove mention of default path number Params: path : string : url path, e.g., '/section/page.html' scheme : string : url scheme, e.g., 'http' Returns: string : normalized path data. """ # Only perform percent-encoding where it is essential. # Always use uppercase A-through-F characters when percent-encoding. # All portions of the URI must be utf-8 encoded NFC from Unicode strings path = quote(unquote(path), "~:/?#[]@!$&'()*+,;=") # Prevent dot-segments appearing in non-relative URI paths. if scheme in ["", "http", "https", "ftp", "file"]: output, part = [], None for part in path.split("/"): if part == "": if not output: output.append(part) elif part == ".": pass elif part == "..": if len(output) > 1: output.pop() else: output.append(part) if part in ["", ".", ".."]: output.append("") path = "/".join(output) # For schemes that define an empty path to be equivalent to a path of "/", # use "/". if not path and scheme in ["http", "https", "ftp", "file"]: path = "/" return path def normalize_fragment(fragment): """Normalize fragment part of the url. Params: fragment : string : url fragment, e.g., 'fragment' Returns: string : normalized fragment data. """ return quote(unquote(fragment), "~") def normalize_query(query): """Normalize query part of the url. Params: query : string : url query, e.g., 'param1=val1¶m2=val2' Returns: string : normalized query data. """ query = "&".join( sorted( [ "=".join( [quote(unquote(t), "~:/?#[]@!$'()*+,;=") for t in q.split("=", 1)] ) for q in query.split("&") ] ) ) return query def url_normalize(url, charset="utf-8"): """URI normalization routine. Sometimes you get an URL by a user that just isn't a real URL because it contains unsafe characters like ' ' and so on. This function can fix some of the problems in a similar way browsers handle data entered by the user: >>> url_normalize('http://de.wikipedia.org/wiki/Elf (Begriffsklärung)') 'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29' Params: charset : string : optional The target charset for the URL if the url was given as unicode string. """ if not url: return url url = provide_url_scheme(url) url = generic_url_cleanup(url) url_elements = deconstruct_url(url) url_elements = url_elements._replace( scheme=normalize_scheme(url_elements.scheme), userinfo=normalize_userinfo(url_elements.userinfo), host=normalize_host(url_elements.host, charset), query=normalize_query(url_elements.query), fragment=normalize_fragment(url_elements.fragment), ) url_elements = url_elements._replace( port=normalize_port(url_elements.port, url_elements.scheme), path=normalize_path(url_elements.path, url_elements.scheme), ) url = reconstruct_url(url_elements) return url PK!H@ WX#url_normalize-1.4.0.dist-info/WHEEL A н#f DI޾p}pfCSmֻ.,"檸m - |PK!H;ݺs &url_normalize-1.4.0.dist-info/METADATAVms6 _zݳ(ILtKfڹ>z+-HʎHVgmnx7D.~+kR RP;J?}Q#,kR<+`фږ2!T> E-c 2Nexcfk][DX#jBI#Y Sbl*#\]+'}Nɔ?ãCa?'1iZ)0ή(KeNuI4%NӇg8|>`g7)||Hk>?d'#nY))bѭk/Ϝ̚ M>4r$#r5훝c㣏/js"Ӱ(e[:7럖N8)ҍd?3rx@^h rugR!y]~?&Cc $dI+xr2ݩފCFB!e@ۭtNd[a j S}ott8uJ9lNʃзB诮(":&' '7(ug GA:1.E dD j@K|?dYFN\\zRFhz Dwă.8]Y$ѷ!5ЭrS0e*:62 VΖpiIʲgezxDs^5P8;\آn!^jIr'1~^v6e%rϾ`C*UwVjJ9EeȼU;{CdSE/[qәO`~51L I$_f38?'@+L}hX z2[5N~ĩn^ZjCΞw@~A l 'D {v+kiԧIǃu.)ƽ?폏Na=i0%P!Rjqip-N;^vi5s[/52 ^A/0n$%^.9[Z쑾6ݗ)қ[|Z*m"ū um@5\wF-"=Rw5;Z8J ;DNT ^MSUduu; #+"/૝n~ `8NؿPK!Hc,$url_normalize-1.4.0.dist-info/RECORDлv0g! T*Q ֧o. T)eM=)MS#~$+lg_%x^䳱h>McKAVA]S,eOgj;S0mi(uQ͹շ~V]\a$AROfV#%kcLJJCC[dϝW q,B1#lۆ