PK!g$$url_normalize/__init__.py# -*- coding: utf-8 -*- """ URI normalizator. URI Normalization function: * Take care of IDN domains. * Always provide the URI scheme in lowercase characters. * Always provide the host, if any, in lowercase characters. * Only perform percent-encoding where it is essential. * Always use uppercase A-through-F characters when percent-encoding. * Prevent dot-segments appearing in non-relative URI paths. * For schemes that define a default authority, use an empty authority if the default is desired. * For schemes that define an empty path to be equivalent to a path of "/", use "/". * For schemes that define a port, use an empty port if the default is desired * All portions of the URI must be utf-8 encoded NFC from Unicode strings Inspired by Sam Ruby's urlnorm.py: http://intertwingly.net/blog/2004/08/04/Urlnorm This fork author: Nikolay Panov () """ from __future__ import absolute_import from .url_normalize import url_normalize __license__ = "Python" __version__ = "1.4.1" __all__ = ["url_normalize"] PK!yt!  url_normalize/tools.py"""Url normalize tools (py27/py37 compatible).""" import re import unicodedata from collections import namedtuple import six from six.moves.urllib.parse import quote as quote_orig from six.moves.urllib.parse import unquote as unquote_orig from six.moves.urllib.parse import urlsplit, urlunsplit URL = namedtuple( "URL", ["scheme", "userinfo", "host", "port", "path", "query", "fragment"] ) def deconstruct_url(url): """Tranform the url into URL structure. Params: url : string : the URL Returns: URL """ scheme, auth, path, query, fragment = urlsplit(url.strip()) (userinfo, host, port) = re.search("([^@]*@)?([^:]*):?(.*)", auth).groups() return URL( fragment=fragment, host=host, path=path, port=port, query=query, scheme=scheme, userinfo=userinfo or "", ) def reconstruct_url(url): """Reconstruct string url from URL. Params: url : URL object instance Returns: string : reconstructed url string """ auth = (url.userinfo or "") + url.host if url.port: auth += ":" + url.port return urlunsplit((url.scheme, auth, url.path, url.query, url.fragment)) def force_unicode(string, charset="utf-8"): """Convert string to unicode if it is not yet unicode. Params: string : string/unicode : an input string charset : string : optional : output encoding Returns: unicode """ if isinstance(string, six.text_type): # Always True on Py3 return string return string.decode(charset, "replace") # Py2 only def unquote(string, charset="utf-8"): """Unquote and normalize unicode string. Params: string : string to be unquoted charset : string : optional : output encoding Returns: string : an unquoted and normalized string """ string = unquote_orig(string) string = force_unicode(string, charset) string = unicodedata.normalize("NFC", string).encode(charset) return string def quote(string, safe="/"): """Quote string. Params: string : string to be quoted safe : string of safe characters Returns: string : quoted string """ string = quote_orig(string, safe) return string PK!B>url_normalize/url_normalize.py# -*- coding: utf-8 -*- """URL normalize main module.""" import re from .tools import deconstruct_url, force_unicode, quote, reconstruct_url, unquote DEFAULT_PORT = { "ftp": "21", "gopher": "70", "http": "80", "https": "443", "news": "119", "nntp": "119", "snews": "563", "snntp": "563", "telnet": "23", "ws": "80", "wss": "443", } DEFAULT_CHARSET = "utf-8" DEFAULT_SCHEME = "https" def provide_url_scheme(url, default_scheme=DEFAULT_SCHEME): """Make sure we have valid url scheme. Params: url : string : the URL default_scheme : string : default scheme to use, e.g. 'https' Returns: string : updated url with validated/attached scheme """ has_scheme = ":" in url[:7] is_universal_scheme = url.startswith("//") is_file_path = url == "-" or (url.startswith("/") and not is_universal_scheme) if not url or has_scheme or is_file_path: return url if is_universal_scheme: return default_scheme + ":" + url return default_scheme + "://" + url def generic_url_cleanup(url): """Cleanup the URL from unnecessary data and convert to final form. Converts shebang urls to final form, removed unnecessary data from the url. Params: url : string : the URL Returns: string : update url """ url = url.replace("#!", "?_escaped_fragment_=") url = re.sub(r"utm_source=[^&]+&?", "", url) url = url.rstrip("&? ") return url def normalize_scheme(scheme): """Normalize scheme part of the url. Params: scheme : string : url scheme, e.g., 'https' Returns: string : normalized scheme data. """ return scheme.lower() def normalize_userinfo(userinfo): """Normalize userinfo part of the url. Params: userinfo : string : url userinfo, e.g., 'user@' Returns: string : normalized userinfo data. """ if userinfo in ["@", ":@"]: return "" return userinfo def normalize_host(host, charset=DEFAULT_CHARSET): """Normalize host part of the url. Lowercase and strip of final dot. Also, take care about IDN domains. Params: host : string : url host, e.g., 'site.com' Returns: string : normalized host data. """ host = force_unicode(host, charset) host = host.lower() host = host.strip(".") host = host.encode("idna").decode(charset) return host def normalize_port(port, scheme): """Normalize port part of the url. Remove mention of default port number Params: port : string : url port, e.g., '8080' scheme : string : url scheme, e.g., 'http' Returns: string : normalized port data. """ if not port.isdigit(): return port port = str(int(port)) if DEFAULT_PORT[scheme] == port: return "" return port def normalize_path(path, scheme): """Normalize path part of the url. Remove mention of default path number Params: path : string : url path, e.g., '/section/page.html' scheme : string : url scheme, e.g., 'http' Returns: string : normalized path data. """ # Only perform percent-encoding where it is essential. # Always use uppercase A-through-F characters when percent-encoding. # All portions of the URI must be utf-8 encoded NFC from Unicode strings path = quote(unquote(path), "~:/?#[]@!$&'()*+,;=") # Prevent dot-segments appearing in non-relative URI paths. if scheme in ["", "http", "https", "ftp", "file"]: output, part = [], None for part in path.split("/"): if part == "": if not output: output.append(part) elif part == ".": pass elif part == "..": if len(output) > 1: output.pop() else: output.append(part) if part in ["", ".", ".."]: output.append("") path = "/".join(output) # For schemes that define an empty path to be equivalent to a path of "/", # use "/". if not path and scheme in ["http", "https", "ftp", "file"]: path = "/" return path def normalize_fragment(fragment): """Normalize fragment part of the url. Params: fragment : string : url fragment, e.g., 'fragment' Returns: string : normalized fragment data. """ return quote(unquote(fragment), "~") def normalize_query(query): """Normalize query part of the url. Params: query : string : url query, e.g., 'param1=val1¶m2=val2' Returns: string : normalized query data. """ query = "&".join( sorted( [ "=".join( [quote(unquote(t), "~:/?#[]@!$'()*+,;=") for t in q.split("=", 1)] ) for q in query.split("&") ] ) ) return query def url_normalize(url, charset=DEFAULT_CHARSET, default_scheme=DEFAULT_SCHEME): """URI normalization routine. Sometimes you get an URL by a user that just isn't a real URL because it contains unsafe characters like ' ' and so on. This function can fix some of the problems in a similar way browsers handle data entered by the user: >>> url_normalize('http://de.wikipedia.org/wiki/Elf (Begriffsklärung)') 'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29' Params: charset : string : optional The target charset for the URL if the url was given as unicode string. Returns: string : a normalized url """ if not url: return url url = provide_url_scheme(url, default_scheme) url = generic_url_cleanup(url) url_elements = deconstruct_url(url) url_elements = url_elements._replace( scheme=normalize_scheme(url_elements.scheme), userinfo=normalize_userinfo(url_elements.userinfo), host=normalize_host(url_elements.host, charset), query=normalize_query(url_elements.query), fragment=normalize_fragment(url_elements.fragment), ) url_elements = url_elements._replace( port=normalize_port(url_elements.port, url_elements.scheme), path=normalize_path(url_elements.path, url_elements.scheme), ) url = reconstruct_url(url_elements) return url PK!H@ WX#url_normalize-1.4.1.dist-info/WHEEL A н#f DI޾p}pfCSmֻ.,"檸m - |PK!HR@  &url_normalize-1.4.1.dist-info/METADATAVms6 _zݳ(ILtfڹ>z--Hʎ襉6xwD.+kR[RP;J?}QTR&0hΚPX^RFX#ZB8^PK26ʫ ]GoU&GśŻ?l徍hkzcv\GB]Y-8nHB:?(v.>Nќ?£}ǝ0  <ىޫƙk'R5f]c MĐ4>@xvʇͦ?y2aBLI\Vn]3x!}TEN ҄}S!G1*[خ?>)eȕ2 D;TS€,ܜSz(W1{1C@@-ނQ w⤣o@J-O”UV! 9,NO`l F @Y6"HX6p!J8͞mA4Uml Å-⥶x$qregkQVZ"? =TŠ]ug5XqQw/8( 5PWu>{ʓ]heh KRWXQfƩu/8#QKKcA/т*Bf v+kiԧIdغuO{^^}юG 91fۥ Gح@OA.vC@%]&* C0uժc9R9AݑmHGܜ0n,I1O{F ܢ_77LT/e+${{>#w*/W8{4iм=^݂:ihvPsK* 1BQkB]ZBj+YQYbDWĦv,8?sPK!H )$url_normalize-1.4.1.dist-info/RECORD̹@oa9j3PA |V&kிICcґc2JI8Túņ;'gw:iIF0G$xc7[gl],Y$zril'Fmu|l 8Gb c>'Wh*[D1 X q-ES9FH_ƆZ۠Ò/BݧtU"@̶JJ\G^ "GwH=D)q|PK!g$$url_normalize/__init__.pyPK!yt!  [url_normalize/tools.pyPK!B> url_normalize/url_normalize.pyPK!H@ WX#&url_normalize-1.4.1.dist-info/WHEELPK!HR@  &j'url_normalize-1.4.1.dist-info/METADATAPK!H )$,url_normalize-1.4.1.dist-info/RECORDPK-