From 5856b85f74f04b254c6d4b5fd716242a6799826b Mon Sep 17 00:00:00 2001 From: coolreader18 <33094578+coolreader18@users.noreply.github.com> Date: Mon, 2 Dec 2019 18:44:43 -0600 Subject: [PATCH 1/7] Add stdlib modules necessary for http from CPython 3.6.0 --- Lib/calendar.py | 713 ++++++++++ Lib/code.py | 314 +++++ Lib/getpass.py | 186 +++ Lib/hmac.py | 144 ++ Lib/html/__init__.py | 132 ++ Lib/html/entities.py | 2509 +++++++++++++++++++++++++++++++++ Lib/html/parser.py | 470 +++++++ Lib/http/__init__.py | 134 ++ Lib/http/client.py | 1478 ++++++++++++++++++++ Lib/http/cookiejar.py | 2098 ++++++++++++++++++++++++++++ Lib/http/cookies.py | 635 +++++++++ Lib/http/server.py | 1211 ++++++++++++++++ Lib/mimetypes.py | 597 ++++++++ Lib/socketserver.py | 793 +++++++++++ Lib/stringprep.py | 272 ++++ Lib/urllib/__init__.py | 0 Lib/urllib/error.py | 81 ++ Lib/urllib/parse.py | 1009 ++++++++++++++ Lib/urllib/request.py | 2743 +++++++++++++++++++++++++++++++++++++ Lib/urllib/response.py | 80 ++ Lib/urllib/robotparser.py | 251 ++++ Lib/uu.py | 199 +++ Lib/uuid.py | 615 +++++++++ 23 files changed, 16664 insertions(+) create mode 100644 Lib/calendar.py create mode 100644 Lib/code.py create mode 100644 Lib/getpass.py create mode 100644 Lib/hmac.py create mode 100644 Lib/html/__init__.py create mode 100644 Lib/html/entities.py create mode 100644 Lib/html/parser.py create mode 100644 Lib/http/__init__.py create mode 100644 Lib/http/client.py create mode 100644 Lib/http/cookiejar.py create mode 100644 Lib/http/cookies.py create mode 100644 Lib/http/server.py create mode 100644 Lib/mimetypes.py create mode 100644 Lib/socketserver.py create mode 100644 Lib/stringprep.py create mode 100644 Lib/urllib/__init__.py create mode 100644 Lib/urllib/error.py create mode 100644 Lib/urllib/parse.py create mode 100644 Lib/urllib/request.py create mode 100644 Lib/urllib/response.py create mode 100644 Lib/urllib/robotparser.py create mode 100755 Lib/uu.py create mode 100644 Lib/uuid.py diff --git a/Lib/calendar.py b/Lib/calendar.py new file mode 100644 index 0000000000..07594f3a83 --- /dev/null +++ b/Lib/calendar.py @@ -0,0 +1,713 @@ +"""Calendar printing functions + +Note when comparing these calendars to the ones printed by cal(1): By +default, these calendars have Monday as the first day of the week, and +Sunday as the last (the European convention). Use setfirstweekday() to +set the first day of the week (0=Monday, 6=Sunday).""" + +import sys +import datetime +import locale as _locale +from itertools import repeat + +__all__ = ["IllegalMonthError", "IllegalWeekdayError", "setfirstweekday", + "firstweekday", "isleap", "leapdays", "weekday", "monthrange", + "monthcalendar", "prmonth", "month", "prcal", "calendar", + "timegm", "month_name", "month_abbr", "day_name", "day_abbr", + "Calendar", "TextCalendar", "HTMLCalendar", "LocaleTextCalendar", + "LocaleHTMLCalendar", "weekheader"] + +# Exception raised for bad input (with string parameter for details) +error = ValueError + +# Exceptions raised for bad input +class IllegalMonthError(ValueError): + def __init__(self, month): + self.month = month + def __str__(self): + return "bad month number %r; must be 1-12" % self.month + + +class IllegalWeekdayError(ValueError): + def __init__(self, weekday): + self.weekday = weekday + def __str__(self): + return "bad weekday number %r; must be 0 (Monday) to 6 (Sunday)" % self.weekday + + +# Constants for months referenced later +January = 1 +February = 2 + +# Number of days per month (except for February in leap years) +mdays = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] + +# This module used to have hard-coded lists of day and month names, as +# English strings. The classes following emulate a read-only version of +# that, but supply localized names. Note that the values are computed +# fresh on each call, in case the user changes locale between calls. + +class _localized_month: + + _months = [datetime.date(2001, i+1, 1).strftime for i in range(12)] + _months.insert(0, lambda x: "") + + def __init__(self, format): + self.format = format + + def __getitem__(self, i): + funcs = self._months[i] + if isinstance(i, slice): + return [f(self.format) for f in funcs] + else: + return funcs(self.format) + + def __len__(self): + return 13 + + +class _localized_day: + + # January 1, 2001, was a Monday. + _days = [datetime.date(2001, 1, i+1).strftime for i in range(7)] + + def __init__(self, format): + self.format = format + + def __getitem__(self, i): + funcs = self._days[i] + if isinstance(i, slice): + return [f(self.format) for f in funcs] + else: + return funcs(self.format) + + def __len__(self): + return 7 + + +# Full and abbreviated names of weekdays +day_name = _localized_day('%A') +day_abbr = _localized_day('%a') + +# Full and abbreviated names of months (1-based arrays!!!) +month_name = _localized_month('%B') +month_abbr = _localized_month('%b') + +# Constants for weekdays +(MONDAY, TUESDAY, WEDNESDAY, THURSDAY, FRIDAY, SATURDAY, SUNDAY) = range(7) + + +def isleap(year): + """Return True for leap years, False for non-leap years.""" + return year % 4 == 0 and (year % 100 != 0 or year % 400 == 0) + + +def leapdays(y1, y2): + """Return number of leap years in range [y1, y2). + Assume y1 <= y2.""" + y1 -= 1 + y2 -= 1 + return (y2//4 - y1//4) - (y2//100 - y1//100) + (y2//400 - y1//400) + + +def weekday(year, month, day): + """Return weekday (0-6 ~ Mon-Sun) for year (1970-...), month (1-12), + day (1-31).""" + return datetime.date(year, month, day).weekday() + + +def monthrange(year, month): + """Return weekday (0-6 ~ Mon-Sun) and number of days (28-31) for + year, month.""" + if not 1 <= month <= 12: + raise IllegalMonthError(month) + day1 = weekday(year, month, 1) + ndays = mdays[month] + (month == February and isleap(year)) + return day1, ndays + + +class Calendar(object): + """ + Base calendar class. This class doesn't do any formatting. It simply + provides data to subclasses. + """ + + def __init__(self, firstweekday=0): + self.firstweekday = firstweekday # 0 = Monday, 6 = Sunday + + def getfirstweekday(self): + return self._firstweekday % 7 + + def setfirstweekday(self, firstweekday): + self._firstweekday = firstweekday + + firstweekday = property(getfirstweekday, setfirstweekday) + + def iterweekdays(self): + """ + Return an iterator for one week of weekday numbers starting with the + configured first one. + """ + for i in range(self.firstweekday, self.firstweekday + 7): + yield i%7 + + def itermonthdates(self, year, month): + """ + Return an iterator for one month. The iterator will yield datetime.date + values and will always iterate through complete weeks, so it will yield + dates outside the specified month. + """ + date = datetime.date(year, month, 1) + # Go back to the beginning of the week + days = (date.weekday() - self.firstweekday) % 7 + date -= datetime.timedelta(days=days) + oneday = datetime.timedelta(days=1) + while True: + yield date + try: + date += oneday + except OverflowError: + # Adding one day could fail after datetime.MAXYEAR + break + if date.month != month and date.weekday() == self.firstweekday: + break + + def itermonthdays2(self, year, month): + """ + Like itermonthdates(), but will yield (day number, weekday number) + tuples. For days outside the specified month the day number is 0. + """ + for i, d in enumerate(self.itermonthdays(year, month), self.firstweekday): + yield d, i % 7 + + def itermonthdays(self, year, month): + """ + Like itermonthdates(), but will yield day numbers. For days outside + the specified month the day number is 0. + """ + day1, ndays = monthrange(year, month) + days_before = (day1 - self.firstweekday) % 7 + yield from repeat(0, days_before) + yield from range(1, ndays + 1) + days_after = (self.firstweekday - day1 - ndays) % 7 + yield from repeat(0, days_after) + + def monthdatescalendar(self, year, month): + """ + Return a matrix (list of lists) representing a month's calendar. + Each row represents a week; week entries are datetime.date values. + """ + dates = list(self.itermonthdates(year, month)) + return [ dates[i:i+7] for i in range(0, len(dates), 7) ] + + def monthdays2calendar(self, year, month): + """ + Return a matrix representing a month's calendar. + Each row represents a week; week entries are + (day number, weekday number) tuples. Day numbers outside this month + are zero. + """ + days = list(self.itermonthdays2(year, month)) + return [ days[i:i+7] for i in range(0, len(days), 7) ] + + def monthdayscalendar(self, year, month): + """ + Return a matrix representing a month's calendar. + Each row represents a week; days outside this month are zero. + """ + days = list(self.itermonthdays(year, month)) + return [ days[i:i+7] for i in range(0, len(days), 7) ] + + def yeardatescalendar(self, year, width=3): + """ + Return the data for the specified year ready for formatting. The return + value is a list of month rows. Each month row contains up to width months. + Each month contains between 4 and 6 weeks and each week contains 1-7 + days. Days are datetime.date objects. + """ + months = [ + self.monthdatescalendar(year, i) + for i in range(January, January+12) + ] + return [months[i:i+width] for i in range(0, len(months), width) ] + + def yeardays2calendar(self, year, width=3): + """ + Return the data for the specified year ready for formatting (similar to + yeardatescalendar()). Entries in the week lists are + (day number, weekday number) tuples. Day numbers outside this month are + zero. + """ + months = [ + self.monthdays2calendar(year, i) + for i in range(January, January+12) + ] + return [months[i:i+width] for i in range(0, len(months), width) ] + + def yeardayscalendar(self, year, width=3): + """ + Return the data for the specified year ready for formatting (similar to + yeardatescalendar()). Entries in the week lists are day numbers. + Day numbers outside this month are zero. + """ + months = [ + self.monthdayscalendar(year, i) + for i in range(January, January+12) + ] + return [months[i:i+width] for i in range(0, len(months), width) ] + + +class TextCalendar(Calendar): + """ + Subclass of Calendar that outputs a calendar as a simple plain text + similar to the UNIX program cal. + """ + + def prweek(self, theweek, width): + """ + Print a single week (no newline). + """ + print(self.formatweek(theweek, width), end=' ') + + def formatday(self, day, weekday, width): + """ + Returns a formatted day. + """ + if day == 0: + s = '' + else: + s = '%2i' % day # right-align single-digit days + return s.center(width) + + def formatweek(self, theweek, width): + """ + Returns a single week in a string (no newline). + """ + return ' '.join(self.formatday(d, wd, width) for (d, wd) in theweek) + + def formatweekday(self, day, width): + """ + Returns a formatted week day name. + """ + if width >= 9: + names = day_name + else: + names = day_abbr + return names[day][:width].center(width) + + def formatweekheader(self, width): + """ + Return a header for a week. + """ + return ' '.join(self.formatweekday(i, width) for i in self.iterweekdays()) + + def formatmonthname(self, theyear, themonth, width, withyear=True): + """ + Return a formatted month name. + """ + s = month_name[themonth] + if withyear: + s = "%s %r" % (s, theyear) + return s.center(width) + + def prmonth(self, theyear, themonth, w=0, l=0): + """ + Print a month's calendar. + """ + print(self.formatmonth(theyear, themonth, w, l), end='') + + def formatmonth(self, theyear, themonth, w=0, l=0): + """ + Return a month's calendar string (multi-line). + """ + w = max(2, w) + l = max(1, l) + s = self.formatmonthname(theyear, themonth, 7 * (w + 1) - 1) + s = s.rstrip() + s += '\n' * l + s += self.formatweekheader(w).rstrip() + s += '\n' * l + for week in self.monthdays2calendar(theyear, themonth): + s += self.formatweek(week, w).rstrip() + s += '\n' * l + return s + + def formatyear(self, theyear, w=2, l=1, c=6, m=3): + """ + Returns a year's calendar as a multi-line string. + """ + w = max(2, w) + l = max(1, l) + c = max(2, c) + colwidth = (w + 1) * 7 - 1 + v = [] + a = v.append + a(repr(theyear).center(colwidth*m+c*(m-1)).rstrip()) + a('\n'*l) + header = self.formatweekheader(w) + for (i, row) in enumerate(self.yeardays2calendar(theyear, m)): + # months in this row + months = range(m*i+1, min(m*(i+1)+1, 13)) + a('\n'*l) + names = (self.formatmonthname(theyear, k, colwidth, False) + for k in months) + a(formatstring(names, colwidth, c).rstrip()) + a('\n'*l) + headers = (header for k in months) + a(formatstring(headers, colwidth, c).rstrip()) + a('\n'*l) + # max number of weeks for this row + height = max(len(cal) for cal in row) + for j in range(height): + weeks = [] + for cal in row: + if j >= len(cal): + weeks.append('') + else: + weeks.append(self.formatweek(cal[j], w)) + a(formatstring(weeks, colwidth, c).rstrip()) + a('\n' * l) + return ''.join(v) + + def pryear(self, theyear, w=0, l=0, c=6, m=3): + """Print a year's calendar.""" + print(self.formatyear(theyear, w, l, c, m)) + + +class HTMLCalendar(Calendar): + """ + This calendar returns complete HTML pages. + """ + + # CSS classes for the day s + cssclasses = ["mon", "tue", "wed", "thu", "fri", "sat", "sun"] + + def formatday(self, day, weekday): + """ + Return a day as a table cell. + """ + if day == 0: + return ' ' # day outside month + else: + return '%d' % (self.cssclasses[weekday], day) + + def formatweek(self, theweek): + """ + Return a complete week as a table row. + """ + s = ''.join(self.formatday(d, wd) for (d, wd) in theweek) + return '%s' % s + + def formatweekday(self, day): + """ + Return a weekday name as a table header. + """ + return '%s' % (self.cssclasses[day], day_abbr[day]) + + def formatweekheader(self): + """ + Return a header for a week as a table row. + """ + s = ''.join(self.formatweekday(i) for i in self.iterweekdays()) + return '%s' % s + + def formatmonthname(self, theyear, themonth, withyear=True): + """ + Return a month name as a table row. + """ + if withyear: + s = '%s %s' % (month_name[themonth], theyear) + else: + s = '%s' % month_name[themonth] + return '%s' % s + + def formatmonth(self, theyear, themonth, withyear=True): + """ + Return a formatted month as a table. + """ + v = [] + a = v.append + a('') + a('\n') + a(self.formatmonthname(theyear, themonth, withyear=withyear)) + a('\n') + a(self.formatweekheader()) + a('\n') + for week in self.monthdays2calendar(theyear, themonth): + a(self.formatweek(week)) + a('\n') + a('
') + a('\n') + return ''.join(v) + + def formatyear(self, theyear, width=3): + """ + Return a formatted year as a table of tables. + """ + v = [] + a = v.append + width = max(width, 1) + a('') + a('\n') + a('' % (width, theyear)) + for i in range(January, January+12, width): + # months in this row + months = range(i, min(i+width, 13)) + a('') + for m in months: + a('') + a('') + a('
%s
') + a(self.formatmonth(theyear, m, withyear=False)) + a('
') + return ''.join(v) + + def formatyearpage(self, theyear, width=3, css='calendar.css', encoding=None): + """ + Return a formatted year as a complete HTML page. + """ + if encoding is None: + encoding = sys.getdefaultencoding() + v = [] + a = v.append + a('\n' % encoding) + a('\n') + a('\n') + a('\n') + a('\n' % encoding) + if css is not None: + a('\n' % css) + a('Calendar for %d\n' % theyear) + a('\n') + a('\n') + a(self.formatyear(theyear, width)) + a('\n') + a('\n') + return ''.join(v).encode(encoding, "xmlcharrefreplace") + + +class different_locale: + def __init__(self, locale): + self.locale = locale + + def __enter__(self): + self.oldlocale = _locale.getlocale(_locale.LC_TIME) + _locale.setlocale(_locale.LC_TIME, self.locale) + + def __exit__(self, *args): + _locale.setlocale(_locale.LC_TIME, self.oldlocale) + + +class LocaleTextCalendar(TextCalendar): + """ + This class can be passed a locale name in the constructor and will return + month and weekday names in the specified locale. If this locale includes + an encoding all strings containing month and weekday names will be returned + as unicode. + """ + + def __init__(self, firstweekday=0, locale=None): + TextCalendar.__init__(self, firstweekday) + if locale is None: + locale = _locale.getdefaultlocale() + self.locale = locale + + def formatweekday(self, day, width): + with different_locale(self.locale): + if width >= 9: + names = day_name + else: + names = day_abbr + name = names[day] + return name[:width].center(width) + + def formatmonthname(self, theyear, themonth, width, withyear=True): + with different_locale(self.locale): + s = month_name[themonth] + if withyear: + s = "%s %r" % (s, theyear) + return s.center(width) + + +class LocaleHTMLCalendar(HTMLCalendar): + """ + This class can be passed a locale name in the constructor and will return + month and weekday names in the specified locale. If this locale includes + an encoding all strings containing month and weekday names will be returned + as unicode. + """ + def __init__(self, firstweekday=0, locale=None): + HTMLCalendar.__init__(self, firstweekday) + if locale is None: + locale = _locale.getdefaultlocale() + self.locale = locale + + def formatweekday(self, day): + with different_locale(self.locale): + s = day_abbr[day] + return '%s' % (self.cssclasses[day], s) + + def formatmonthname(self, theyear, themonth, withyear=True): + with different_locale(self.locale): + s = month_name[themonth] + if withyear: + s = '%s %s' % (s, theyear) + return '%s' % s + + +# Support for old module level interface +c = TextCalendar() + +firstweekday = c.getfirstweekday + +def setfirstweekday(firstweekday): + if not MONDAY <= firstweekday <= SUNDAY: + raise IllegalWeekdayError(firstweekday) + c.firstweekday = firstweekday + +monthcalendar = c.monthdayscalendar +prweek = c.prweek +week = c.formatweek +weekheader = c.formatweekheader +prmonth = c.prmonth +month = c.formatmonth +calendar = c.formatyear +prcal = c.pryear + + +# Spacing of month columns for multi-column year calendar +_colwidth = 7*3 - 1 # Amount printed by prweek() +_spacing = 6 # Number of spaces between columns + + +def format(cols, colwidth=_colwidth, spacing=_spacing): + """Prints multi-column formatting for year calendars""" + print(formatstring(cols, colwidth, spacing)) + + +def formatstring(cols, colwidth=_colwidth, spacing=_spacing): + """Returns a string formatted from n strings, centered within n columns.""" + spacing *= ' ' + return spacing.join(c.center(colwidth) for c in cols) + + +EPOCH = 1970 +_EPOCH_ORD = datetime.date(EPOCH, 1, 1).toordinal() + + +def timegm(tuple): + """Unrelated but handy function to calculate Unix timestamp from GMT.""" + year, month, day, hour, minute, second = tuple[:6] + days = datetime.date(year, month, 1).toordinal() - _EPOCH_ORD + day - 1 + hours = days*24 + hour + minutes = hours*60 + minute + seconds = minutes*60 + second + return seconds + + +def main(args): + import argparse + parser = argparse.ArgumentParser() + textgroup = parser.add_argument_group('text only arguments') + htmlgroup = parser.add_argument_group('html only arguments') + textgroup.add_argument( + "-w", "--width", + type=int, default=2, + help="width of date column (default 2)" + ) + textgroup.add_argument( + "-l", "--lines", + type=int, default=1, + help="number of lines for each week (default 1)" + ) + textgroup.add_argument( + "-s", "--spacing", + type=int, default=6, + help="spacing between months (default 6)" + ) + textgroup.add_argument( + "-m", "--months", + type=int, default=3, + help="months per row (default 3)" + ) + htmlgroup.add_argument( + "-c", "--css", + default="calendar.css", + help="CSS to use for page" + ) + parser.add_argument( + "-L", "--locale", + default=None, + help="locale to be used from month and weekday names" + ) + parser.add_argument( + "-e", "--encoding", + default=None, + help="encoding to use for output" + ) + parser.add_argument( + "-t", "--type", + default="text", + choices=("text", "html"), + help="output type (text or html)" + ) + parser.add_argument( + "year", + nargs='?', type=int, + help="year number (1-9999)" + ) + parser.add_argument( + "month", + nargs='?', type=int, + help="month number (1-12, text only)" + ) + + options = parser.parse_args(args[1:]) + + if options.locale and not options.encoding: + parser.error("if --locale is specified --encoding is required") + sys.exit(1) + + locale = options.locale, options.encoding + + if options.type == "html": + if options.locale: + cal = LocaleHTMLCalendar(locale=locale) + else: + cal = HTMLCalendar() + encoding = options.encoding + if encoding is None: + encoding = sys.getdefaultencoding() + optdict = dict(encoding=encoding, css=options.css) + write = sys.stdout.buffer.write + if options.year is None: + write(cal.formatyearpage(datetime.date.today().year, **optdict)) + elif options.month is None: + write(cal.formatyearpage(options.year, **optdict)) + else: + parser.error("incorrect number of arguments") + sys.exit(1) + else: + if options.locale: + cal = LocaleTextCalendar(locale=locale) + else: + cal = TextCalendar() + optdict = dict(w=options.width, l=options.lines) + if options.month is None: + optdict["c"] = options.spacing + optdict["m"] = options.months + if options.year is None: + result = cal.formatyear(datetime.date.today().year, **optdict) + elif options.month is None: + result = cal.formatyear(options.year, **optdict) + else: + result = cal.formatmonth(options.year, options.month, **optdict) + write = sys.stdout.write + if options.encoding: + result = result.encode(options.encoding) + write = sys.stdout.buffer.write + write(result) + + +if __name__ == "__main__": + main(sys.argv) diff --git a/Lib/code.py b/Lib/code.py new file mode 100644 index 0000000000..23295f4cf5 --- /dev/null +++ b/Lib/code.py @@ -0,0 +1,314 @@ +"""Utilities needed to emulate Python's interactive interpreter. + +""" + +# Inspired by similar code by Jeff Epler and Fredrik Lundh. + + +import sys +import traceback +import argparse +from codeop import CommandCompiler, compile_command + +__all__ = ["InteractiveInterpreter", "InteractiveConsole", "interact", + "compile_command"] + +class InteractiveInterpreter: + """Base class for InteractiveConsole. + + This class deals with parsing and interpreter state (the user's + namespace); it doesn't deal with input buffering or prompting or + input file naming (the filename is always passed in explicitly). + + """ + + def __init__(self, locals=None): + """Constructor. + + The optional 'locals' argument specifies the dictionary in + which code will be executed; it defaults to a newly created + dictionary with key "__name__" set to "__console__" and key + "__doc__" set to None. + + """ + if locals is None: + locals = {"__name__": "__console__", "__doc__": None} + self.locals = locals + self.compile = CommandCompiler() + + def runsource(self, source, filename="", symbol="single"): + """Compile and run some source in the interpreter. + + Arguments are as for compile_command(). + + One several things can happen: + + 1) The input is incorrect; compile_command() raised an + exception (SyntaxError or OverflowError). A syntax traceback + will be printed by calling the showsyntaxerror() method. + + 2) The input is incomplete, and more input is required; + compile_command() returned None. Nothing happens. + + 3) The input is complete; compile_command() returned a code + object. The code is executed by calling self.runcode() (which + also handles run-time exceptions, except for SystemExit). + + The return value is True in case 2, False in the other cases (unless + an exception is raised). The return value can be used to + decide whether to use sys.ps1 or sys.ps2 to prompt the next + line. + + """ + try: + code = self.compile(source, filename, symbol) + except (OverflowError, SyntaxError, ValueError): + # Case 1 + self.showsyntaxerror(filename) + return False + + if code is None: + # Case 2 + return True + + # Case 3 + self.runcode(code) + return False + + def runcode(self, code): + """Execute a code object. + + When an exception occurs, self.showtraceback() is called to + display a traceback. All exceptions are caught except + SystemExit, which is reraised. + + A note about KeyboardInterrupt: this exception may occur + elsewhere in this code, and may not always be caught. The + caller should be prepared to deal with it. + + """ + try: + exec(code, self.locals) + except SystemExit: + raise + except: + self.showtraceback() + + def showsyntaxerror(self, filename=None): + """Display the syntax error that just occurred. + + This doesn't display a stack trace because there isn't one. + + If a filename is given, it is stuffed in the exception instead + of what was there before (because Python's parser always uses + "" when reading from a string). + + The output is written by self.write(), below. + + """ + type, value, tb = sys.exc_info() + sys.last_type = type + sys.last_value = value + sys.last_traceback = tb + if filename and type is SyntaxError: + # Work hard to stuff the correct filename in the exception + try: + msg, (dummy_filename, lineno, offset, line) = value.args + except ValueError: + # Not the format we expect; leave it alone + pass + else: + # Stuff in the right filename + value = SyntaxError(msg, (filename, lineno, offset, line)) + sys.last_value = value + if sys.excepthook is sys.__excepthook__: + lines = traceback.format_exception_only(type, value) + self.write(''.join(lines)) + else: + # If someone has set sys.excepthook, we let that take precedence + # over self.write + sys.excepthook(type, value, tb) + + def showtraceback(self): + """Display the exception that just occurred. + + We remove the first stack item because it is our own code. + + The output is written by self.write(), below. + + """ + sys.last_type, sys.last_value, last_tb = ei = sys.exc_info() + sys.last_traceback = last_tb + try: + lines = traceback.format_exception(ei[0], ei[1], last_tb.tb_next) + if sys.excepthook is sys.__excepthook__: + self.write(''.join(lines)) + else: + # If someone has set sys.excepthook, we let that take precedence + # over self.write + sys.excepthook(ei[0], ei[1], last_tb) + finally: + last_tb = ei = None + + def write(self, data): + """Write a string. + + The base implementation writes to sys.stderr; a subclass may + replace this with a different implementation. + + """ + sys.stderr.write(data) + + +class InteractiveConsole(InteractiveInterpreter): + """Closely emulate the behavior of the interactive Python interpreter. + + This class builds on InteractiveInterpreter and adds prompting + using the familiar sys.ps1 and sys.ps2, and input buffering. + + """ + + def __init__(self, locals=None, filename=""): + """Constructor. + + The optional locals argument will be passed to the + InteractiveInterpreter base class. + + The optional filename argument should specify the (file)name + of the input stream; it will show up in tracebacks. + + """ + InteractiveInterpreter.__init__(self, locals) + self.filename = filename + self.resetbuffer() + + def resetbuffer(self): + """Reset the input buffer.""" + self.buffer = [] + + def interact(self, banner=None, exitmsg=None): + """Closely emulate the interactive Python console. + + The optional banner argument specifies the banner to print + before the first interaction; by default it prints a banner + similar to the one printed by the real Python interpreter, + followed by the current class name in parentheses (so as not + to confuse this with the real interpreter -- since it's so + close!). + + The optional exitmsg argument specifies the exit message + printed when exiting. Pass the empty string to suppress + printing an exit message. If exitmsg is not given or None, + a default message is printed. + + """ + try: + sys.ps1 + except AttributeError: + sys.ps1 = ">>> " + try: + sys.ps2 + except AttributeError: + sys.ps2 = "... " + cprt = 'Type "help", "copyright", "credits" or "license" for more information.' + if banner is None: + self.write("Python %s on %s\n%s\n(%s)\n" % + (sys.version, sys.platform, cprt, + self.__class__.__name__)) + elif banner: + self.write("%s\n" % str(banner)) + more = 0 + while 1: + try: + if more: + prompt = sys.ps2 + else: + prompt = sys.ps1 + try: + line = self.raw_input(prompt) + except EOFError: + self.write("\n") + break + else: + more = self.push(line) + except KeyboardInterrupt: + self.write("\nKeyboardInterrupt\n") + self.resetbuffer() + more = 0 + if exitmsg is None: + self.write('now exiting %s...\n' % self.__class__.__name__) + elif exitmsg != '': + self.write('%s\n' % exitmsg) + + def push(self, line): + """Push a line to the interpreter. + + The line should not have a trailing newline; it may have + internal newlines. The line is appended to a buffer and the + interpreter's runsource() method is called with the + concatenated contents of the buffer as source. If this + indicates that the command was executed or invalid, the buffer + is reset; otherwise, the command is incomplete, and the buffer + is left as it was after the line was appended. The return + value is 1 if more input is required, 0 if the line was dealt + with in some way (this is the same as runsource()). + + """ + self.buffer.append(line) + source = "\n".join(self.buffer) + more = self.runsource(source, self.filename) + if not more: + self.resetbuffer() + return more + + def raw_input(self, prompt=""): + """Write a prompt and read a line. + + The returned line does not include the trailing newline. + When the user enters the EOF key sequence, EOFError is raised. + + The base implementation uses the built-in function + input(); a subclass may replace this with a different + implementation. + + """ + return input(prompt) + + + +def interact(banner=None, readfunc=None, local=None, exitmsg=None): + """Closely emulate the interactive Python interpreter. + + This is a backwards compatible interface to the InteractiveConsole + class. When readfunc is not specified, it attempts to import the + readline module to enable GNU readline if it is available. + + Arguments (all optional, all default to None): + + banner -- passed to InteractiveConsole.interact() + readfunc -- if not None, replaces InteractiveConsole.raw_input() + local -- passed to InteractiveInterpreter.__init__() + exitmsg -- passed to InteractiveConsole.interact() + + """ + console = InteractiveConsole(local) + if readfunc is not None: + console.raw_input = readfunc + else: + try: + import readline + except ImportError: + pass + console.interact(banner, exitmsg) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('-q', action='store_true', + help="don't print version and copyright messages") + args = parser.parse_args() + if args.q or sys.flags.quiet: + banner = '' + else: + banner = None + interact(banner) diff --git a/Lib/getpass.py b/Lib/getpass.py new file mode 100644 index 0000000000..be51121158 --- /dev/null +++ b/Lib/getpass.py @@ -0,0 +1,186 @@ +"""Utilities to get a password and/or the current user name. + +getpass(prompt[, stream]) - Prompt for a password, with echo turned off. +getuser() - Get the user name from the environment or password database. + +GetPassWarning - This UserWarning is issued when getpass() cannot prevent + echoing of the password contents while reading. + +On Windows, the msvcrt module will be used. +On the Mac EasyDialogs.AskPassword is used, if available. + +""" + +# Authors: Piers Lauder (original) +# Guido van Rossum (Windows support and cleanup) +# Gregory P. Smith (tty support & GetPassWarning) + +import contextlib +import io +import os +import sys +import warnings + +__all__ = ["getpass","getuser","GetPassWarning"] + + +class GetPassWarning(UserWarning): pass + + +def unix_getpass(prompt='Password: ', stream=None): + """Prompt for a password, with echo turned off. + + Args: + prompt: Written on stream to ask for the input. Default: 'Password: ' + stream: A writable file object to display the prompt. Defaults to + the tty. If no tty is available defaults to sys.stderr. + Returns: + The seKr3t input. + Raises: + EOFError: If our input tty or stdin was closed. + GetPassWarning: When we were unable to turn echo off on the input. + + Always restores terminal settings before returning. + """ + passwd = None + with contextlib.ExitStack() as stack: + try: + # Always try reading and writing directly on the tty first. + fd = os.open('/dev/tty', os.O_RDWR|os.O_NOCTTY) + tty = io.FileIO(fd, 'w+') + stack.enter_context(tty) + input = io.TextIOWrapper(tty) + stack.enter_context(input) + if not stream: + stream = input + except OSError as e: + # If that fails, see if stdin can be controlled. + stack.close() + try: + fd = sys.stdin.fileno() + except (AttributeError, ValueError): + fd = None + passwd = fallback_getpass(prompt, stream) + input = sys.stdin + if not stream: + stream = sys.stderr + + if fd is not None: + try: + old = termios.tcgetattr(fd) # a copy to save + new = old[:] + new[3] &= ~termios.ECHO # 3 == 'lflags' + tcsetattr_flags = termios.TCSAFLUSH + if hasattr(termios, 'TCSASOFT'): + tcsetattr_flags |= termios.TCSASOFT + try: + termios.tcsetattr(fd, tcsetattr_flags, new) + passwd = _raw_input(prompt, stream, input=input) + finally: + termios.tcsetattr(fd, tcsetattr_flags, old) + stream.flush() # issue7208 + except termios.error: + if passwd is not None: + # _raw_input succeeded. The final tcsetattr failed. Reraise + # instead of leaving the terminal in an unknown state. + raise + # We can't control the tty or stdin. Give up and use normal IO. + # fallback_getpass() raises an appropriate warning. + if stream is not input: + # clean up unused file objects before blocking + stack.close() + passwd = fallback_getpass(prompt, stream) + + stream.write('\n') + return passwd + + +def win_getpass(prompt='Password: ', stream=None): + """Prompt for password with echo off, using Windows getch().""" + if sys.stdin is not sys.__stdin__: + return fallback_getpass(prompt, stream) + + for c in prompt: + msvcrt.putwch(c) + pw = "" + while 1: + c = msvcrt.getwch() + if c == '\r' or c == '\n': + break + if c == '\003': + raise KeyboardInterrupt + if c == '\b': + pw = pw[:-1] + else: + pw = pw + c + msvcrt.putwch('\r') + msvcrt.putwch('\n') + return pw + + +def fallback_getpass(prompt='Password: ', stream=None): + warnings.warn("Can not control echo on the terminal.", GetPassWarning, + stacklevel=2) + if not stream: + stream = sys.stderr + print("Warning: Password input may be echoed.", file=stream) + return _raw_input(prompt, stream) + + +def _raw_input(prompt="", stream=None, input=None): + # This doesn't save the string in the GNU readline history. + if not stream: + stream = sys.stderr + if not input: + input = sys.stdin + prompt = str(prompt) + if prompt: + try: + stream.write(prompt) + except UnicodeEncodeError: + # Use replace error handler to get as much as possible printed. + prompt = prompt.encode(stream.encoding, 'replace') + prompt = prompt.decode(stream.encoding) + stream.write(prompt) + stream.flush() + # NOTE: The Python C API calls flockfile() (and unlock) during readline. + line = input.readline() + if not line: + raise EOFError + if line[-1] == '\n': + line = line[:-1] + return line + + +def getuser(): + """Get the username from the environment or password database. + + First try various environment variables, then the password + database. This works on Windows as long as USERNAME is set. + + """ + + for name in ('LOGNAME', 'USER', 'LNAME', 'USERNAME'): + user = os.environ.get(name) + if user: + return user + + # If this fails, the exception will "explain" why + import pwd + return pwd.getpwuid(os.getuid())[0] + +# Bind the name getpass to the appropriate function +try: + import termios + # it's possible there is an incompatible termios from the + # McMillan Installer, make sure we have a UNIX-compatible termios + termios.tcgetattr, termios.tcsetattr +except (ImportError, AttributeError): + try: + import msvcrt + except ImportError: + getpass = fallback_getpass + else: + getpass = win_getpass +else: + getpass = unix_getpass diff --git a/Lib/hmac.py b/Lib/hmac.py new file mode 100644 index 0000000000..121029aa67 --- /dev/null +++ b/Lib/hmac.py @@ -0,0 +1,144 @@ +"""HMAC (Keyed-Hashing for Message Authentication) Python module. + +Implements the HMAC algorithm as described by RFC 2104. +""" + +import warnings as _warnings +from _operator import _compare_digest as compare_digest +import hashlib as _hashlib + +trans_5C = bytes((x ^ 0x5C) for x in range(256)) +trans_36 = bytes((x ^ 0x36) for x in range(256)) + +# The size of the digests returned by HMAC depends on the underlying +# hashing module used. Use digest_size from the instance of HMAC instead. +digest_size = None + + + +class HMAC: + """RFC 2104 HMAC class. Also complies with RFC 4231. + + This supports the API for Cryptographic Hash Functions (PEP 247). + """ + blocksize = 64 # 512-bit HMAC; can be changed in subclasses. + + def __init__(self, key, msg = None, digestmod = None): + """Create a new HMAC object. + + key: key for the keyed hash object. + msg: Initial input for the hash, if provided. + digestmod: A module supporting PEP 247. *OR* + A hashlib constructor returning a new hash object. *OR* + A hash name suitable for hashlib.new(). + Defaults to hashlib.md5. + Implicit default to hashlib.md5 is deprecated and will be + removed in Python 3.6. + + Note: key and msg must be a bytes or bytearray objects. + """ + + if not isinstance(key, (bytes, bytearray)): + raise TypeError("key: expected bytes or bytearray, but got %r" % type(key).__name__) + + if digestmod is None: + _warnings.warn("HMAC() without an explicit digestmod argument " + "is deprecated.", PendingDeprecationWarning, 2) + digestmod = _hashlib.md5 + + if callable(digestmod): + self.digest_cons = digestmod + elif isinstance(digestmod, str): + self.digest_cons = lambda d=b'': _hashlib.new(digestmod, d) + else: + self.digest_cons = lambda d=b'': digestmod.new(d) + + self.outer = self.digest_cons() + self.inner = self.digest_cons() + self.digest_size = self.inner.digest_size + + if hasattr(self.inner, 'block_size'): + blocksize = self.inner.block_size + if blocksize < 16: + _warnings.warn('block_size of %d seems too small; using our ' + 'default of %d.' % (blocksize, self.blocksize), + RuntimeWarning, 2) + blocksize = self.blocksize + else: + _warnings.warn('No block_size attribute on given digest object; ' + 'Assuming %d.' % (self.blocksize), + RuntimeWarning, 2) + blocksize = self.blocksize + + # self.blocksize is the default blocksize. self.block_size is + # effective block size as well as the public API attribute. + self.block_size = blocksize + + if len(key) > blocksize: + key = self.digest_cons(key).digest() + + key = key.ljust(blocksize, b'\0') + self.outer.update(key.translate(trans_5C)) + self.inner.update(key.translate(trans_36)) + if msg is not None: + self.update(msg) + + @property + def name(self): + return "hmac-" + self.inner.name + + def update(self, msg): + """Update this hashing object with the string msg. + """ + self.inner.update(msg) + + def copy(self): + """Return a separate copy of this hashing object. + + An update to this copy won't affect the original object. + """ + # Call __new__ directly to avoid the expensive __init__. + other = self.__class__.__new__(self.__class__) + other.digest_cons = self.digest_cons + other.digest_size = self.digest_size + other.inner = self.inner.copy() + other.outer = self.outer.copy() + return other + + def _current(self): + """Return a hash object for the current state. + + To be used only internally with digest() and hexdigest(). + """ + h = self.outer.copy() + h.update(self.inner.digest()) + return h + + def digest(self): + """Return the hash value of this hashing object. + + This returns a string containing 8-bit data. The object is + not altered in any way by this function; you can continue + updating the object after calling this function. + """ + h = self._current() + return h.digest() + + def hexdigest(self): + """Like digest(), but returns a string of hexadecimal digits instead. + """ + h = self._current() + return h.hexdigest() + +def new(key, msg = None, digestmod = None): + """Create a new hashing object and return it. + + key: The starting key for the hash. + msg: if available, will immediately be hashed into the object's starting + state. + + You can now feed arbitrary strings into the object using its update() + method, and can ask for the hash value at any time by calling its digest() + method. + """ + return HMAC(key, msg, digestmod) diff --git a/Lib/html/__init__.py b/Lib/html/__init__.py new file mode 100644 index 0000000000..da0a0a3ce7 --- /dev/null +++ b/Lib/html/__init__.py @@ -0,0 +1,132 @@ +""" +General functions for HTML manipulation. +""" + +import re as _re +from html.entities import html5 as _html5 + + +__all__ = ['escape', 'unescape'] + + +def escape(s, quote=True): + """ + Replace special characters "&", "<" and ">" to HTML-safe sequences. + If the optional flag quote is true (the default), the quotation mark + characters, both double quote (") and single quote (') characters are also + translated. + """ + s = s.replace("&", "&") # Must be done first! + s = s.replace("<", "<") + s = s.replace(">", ">") + if quote: + s = s.replace('"', """) + s = s.replace('\'', "'") + return s + + +# see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references + +_invalid_charrefs = { + 0x00: '\ufffd', # REPLACEMENT CHARACTER + 0x0d: '\r', # CARRIAGE RETURN + 0x80: '\u20ac', # EURO SIGN + 0x81: '\x81', # + 0x82: '\u201a', # SINGLE LOW-9 QUOTATION MARK + 0x83: '\u0192', # LATIN SMALL LETTER F WITH HOOK + 0x84: '\u201e', # DOUBLE LOW-9 QUOTATION MARK + 0x85: '\u2026', # HORIZONTAL ELLIPSIS + 0x86: '\u2020', # DAGGER + 0x87: '\u2021', # DOUBLE DAGGER + 0x88: '\u02c6', # MODIFIER LETTER CIRCUMFLEX ACCENT + 0x89: '\u2030', # PER MILLE SIGN + 0x8a: '\u0160', # LATIN CAPITAL LETTER S WITH CARON + 0x8b: '\u2039', # SINGLE LEFT-POINTING ANGLE QUOTATION MARK + 0x8c: '\u0152', # LATIN CAPITAL LIGATURE OE + 0x8d: '\x8d', # + 0x8e: '\u017d', # LATIN CAPITAL LETTER Z WITH CARON + 0x8f: '\x8f', # + 0x90: '\x90', # + 0x91: '\u2018', # LEFT SINGLE QUOTATION MARK + 0x92: '\u2019', # RIGHT SINGLE QUOTATION MARK + 0x93: '\u201c', # LEFT DOUBLE QUOTATION MARK + 0x94: '\u201d', # RIGHT DOUBLE QUOTATION MARK + 0x95: '\u2022', # BULLET + 0x96: '\u2013', # EN DASH + 0x97: '\u2014', # EM DASH + 0x98: '\u02dc', # SMALL TILDE + 0x99: '\u2122', # TRADE MARK SIGN + 0x9a: '\u0161', # LATIN SMALL LETTER S WITH CARON + 0x9b: '\u203a', # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK + 0x9c: '\u0153', # LATIN SMALL LIGATURE OE + 0x9d: '\x9d', # + 0x9e: '\u017e', # LATIN SMALL LETTER Z WITH CARON + 0x9f: '\u0178', # LATIN CAPITAL LETTER Y WITH DIAERESIS +} + +_invalid_codepoints = { + # 0x0001 to 0x0008 + 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, + # 0x000E to 0x001F + 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, + 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + # 0x007F to 0x009F + 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, + 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, + 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + # 0xFDD0 to 0xFDEF + 0xfdd0, 0xfdd1, 0xfdd2, 0xfdd3, 0xfdd4, 0xfdd5, 0xfdd6, 0xfdd7, 0xfdd8, + 0xfdd9, 0xfdda, 0xfddb, 0xfddc, 0xfddd, 0xfdde, 0xfddf, 0xfde0, 0xfde1, + 0xfde2, 0xfde3, 0xfde4, 0xfde5, 0xfde6, 0xfde7, 0xfde8, 0xfde9, 0xfdea, + 0xfdeb, 0xfdec, 0xfded, 0xfdee, 0xfdef, + # others + 0xb, 0xfffe, 0xffff, 0x1fffe, 0x1ffff, 0x2fffe, 0x2ffff, 0x3fffe, 0x3ffff, + 0x4fffe, 0x4ffff, 0x5fffe, 0x5ffff, 0x6fffe, 0x6ffff, 0x7fffe, 0x7ffff, + 0x8fffe, 0x8ffff, 0x9fffe, 0x9ffff, 0xafffe, 0xaffff, 0xbfffe, 0xbffff, + 0xcfffe, 0xcffff, 0xdfffe, 0xdffff, 0xefffe, 0xeffff, 0xffffe, 0xfffff, + 0x10fffe, 0x10ffff +} + + +def _replace_charref(s): + s = s.group(1) + if s[0] == '#': + # numeric charref + if s[1] in 'xX': + num = int(s[2:].rstrip(';'), 16) + else: + num = int(s[1:].rstrip(';')) + if num in _invalid_charrefs: + return _invalid_charrefs[num] + if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF: + return '\uFFFD' + if num in _invalid_codepoints: + return '' + return chr(num) + else: + # named charref + if s in _html5: + return _html5[s] + # find the longest matching name (as defined by the standard) + for x in range(len(s)-1, 1, -1): + if s[:x] in _html5: + return _html5[s[:x]] + s[x:] + else: + return '&' + s + + +_charref = _re.compile(r'&(#[0-9]+;?' + r'|#[xX][0-9a-fA-F]+;?' + r'|[^\t\n\f <&#;]{1,32};?)') + +def unescape(s): + """ + Convert all named and numeric character references (e.g. >, >, + &x3e;) in the string s to the corresponding unicode characters. + This function uses the rules defined by the HTML 5 standard + for both valid and invalid character references, and the list of + HTML 5 named character references defined in html.entities.html5. + """ + if '&' not in s: + return s + return _charref.sub(_replace_charref, s) diff --git a/Lib/html/entities.py b/Lib/html/entities.py new file mode 100644 index 0000000000..91ea5da2af --- /dev/null +++ b/Lib/html/entities.py @@ -0,0 +1,2509 @@ +"""HTML character entity references.""" + +__all__ = ['html5', 'name2codepoint', 'codepoint2name', 'entitydefs'] + + +# maps the HTML entity name to the Unicode code point +name2codepoint = { + 'AElig': 0x00c6, # latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1 + 'Aacute': 0x00c1, # latin capital letter A with acute, U+00C1 ISOlat1 + 'Acirc': 0x00c2, # latin capital letter A with circumflex, U+00C2 ISOlat1 + 'Agrave': 0x00c0, # latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1 + 'Alpha': 0x0391, # greek capital letter alpha, U+0391 + 'Aring': 0x00c5, # latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1 + 'Atilde': 0x00c3, # latin capital letter A with tilde, U+00C3 ISOlat1 + 'Auml': 0x00c4, # latin capital letter A with diaeresis, U+00C4 ISOlat1 + 'Beta': 0x0392, # greek capital letter beta, U+0392 + 'Ccedil': 0x00c7, # latin capital letter C with cedilla, U+00C7 ISOlat1 + 'Chi': 0x03a7, # greek capital letter chi, U+03A7 + 'Dagger': 0x2021, # double dagger, U+2021 ISOpub + 'Delta': 0x0394, # greek capital letter delta, U+0394 ISOgrk3 + 'ETH': 0x00d0, # latin capital letter ETH, U+00D0 ISOlat1 + 'Eacute': 0x00c9, # latin capital letter E with acute, U+00C9 ISOlat1 + 'Ecirc': 0x00ca, # latin capital letter E with circumflex, U+00CA ISOlat1 + 'Egrave': 0x00c8, # latin capital letter E with grave, U+00C8 ISOlat1 + 'Epsilon': 0x0395, # greek capital letter epsilon, U+0395 + 'Eta': 0x0397, # greek capital letter eta, U+0397 + 'Euml': 0x00cb, # latin capital letter E with diaeresis, U+00CB ISOlat1 + 'Gamma': 0x0393, # greek capital letter gamma, U+0393 ISOgrk3 + 'Iacute': 0x00cd, # latin capital letter I with acute, U+00CD ISOlat1 + 'Icirc': 0x00ce, # latin capital letter I with circumflex, U+00CE ISOlat1 + 'Igrave': 0x00cc, # latin capital letter I with grave, U+00CC ISOlat1 + 'Iota': 0x0399, # greek capital letter iota, U+0399 + 'Iuml': 0x00cf, # latin capital letter I with diaeresis, U+00CF ISOlat1 + 'Kappa': 0x039a, # greek capital letter kappa, U+039A + 'Lambda': 0x039b, # greek capital letter lambda, U+039B ISOgrk3 + 'Mu': 0x039c, # greek capital letter mu, U+039C + 'Ntilde': 0x00d1, # latin capital letter N with tilde, U+00D1 ISOlat1 + 'Nu': 0x039d, # greek capital letter nu, U+039D + 'OElig': 0x0152, # latin capital ligature OE, U+0152 ISOlat2 + 'Oacute': 0x00d3, # latin capital letter O with acute, U+00D3 ISOlat1 + 'Ocirc': 0x00d4, # latin capital letter O with circumflex, U+00D4 ISOlat1 + 'Ograve': 0x00d2, # latin capital letter O with grave, U+00D2 ISOlat1 + 'Omega': 0x03a9, # greek capital letter omega, U+03A9 ISOgrk3 + 'Omicron': 0x039f, # greek capital letter omicron, U+039F + 'Oslash': 0x00d8, # latin capital letter O with stroke = latin capital letter O slash, U+00D8 ISOlat1 + 'Otilde': 0x00d5, # latin capital letter O with tilde, U+00D5 ISOlat1 + 'Ouml': 0x00d6, # latin capital letter O with diaeresis, U+00D6 ISOlat1 + 'Phi': 0x03a6, # greek capital letter phi, U+03A6 ISOgrk3 + 'Pi': 0x03a0, # greek capital letter pi, U+03A0 ISOgrk3 + 'Prime': 0x2033, # double prime = seconds = inches, U+2033 ISOtech + 'Psi': 0x03a8, # greek capital letter psi, U+03A8 ISOgrk3 + 'Rho': 0x03a1, # greek capital letter rho, U+03A1 + 'Scaron': 0x0160, # latin capital letter S with caron, U+0160 ISOlat2 + 'Sigma': 0x03a3, # greek capital letter sigma, U+03A3 ISOgrk3 + 'THORN': 0x00de, # latin capital letter THORN, U+00DE ISOlat1 + 'Tau': 0x03a4, # greek capital letter tau, U+03A4 + 'Theta': 0x0398, # greek capital letter theta, U+0398 ISOgrk3 + 'Uacute': 0x00da, # latin capital letter U with acute, U+00DA ISOlat1 + 'Ucirc': 0x00db, # latin capital letter U with circumflex, U+00DB ISOlat1 + 'Ugrave': 0x00d9, # latin capital letter U with grave, U+00D9 ISOlat1 + 'Upsilon': 0x03a5, # greek capital letter upsilon, U+03A5 ISOgrk3 + 'Uuml': 0x00dc, # latin capital letter U with diaeresis, U+00DC ISOlat1 + 'Xi': 0x039e, # greek capital letter xi, U+039E ISOgrk3 + 'Yacute': 0x00dd, # latin capital letter Y with acute, U+00DD ISOlat1 + 'Yuml': 0x0178, # latin capital letter Y with diaeresis, U+0178 ISOlat2 + 'Zeta': 0x0396, # greek capital letter zeta, U+0396 + 'aacute': 0x00e1, # latin small letter a with acute, U+00E1 ISOlat1 + 'acirc': 0x00e2, # latin small letter a with circumflex, U+00E2 ISOlat1 + 'acute': 0x00b4, # acute accent = spacing acute, U+00B4 ISOdia + 'aelig': 0x00e6, # latin small letter ae = latin small ligature ae, U+00E6 ISOlat1 + 'agrave': 0x00e0, # latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1 + 'alefsym': 0x2135, # alef symbol = first transfinite cardinal, U+2135 NEW + 'alpha': 0x03b1, # greek small letter alpha, U+03B1 ISOgrk3 + 'amp': 0x0026, # ampersand, U+0026 ISOnum + 'and': 0x2227, # logical and = wedge, U+2227 ISOtech + 'ang': 0x2220, # angle, U+2220 ISOamso + 'aring': 0x00e5, # latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1 + 'asymp': 0x2248, # almost equal to = asymptotic to, U+2248 ISOamsr + 'atilde': 0x00e3, # latin small letter a with tilde, U+00E3 ISOlat1 + 'auml': 0x00e4, # latin small letter a with diaeresis, U+00E4 ISOlat1 + 'bdquo': 0x201e, # double low-9 quotation mark, U+201E NEW + 'beta': 0x03b2, # greek small letter beta, U+03B2 ISOgrk3 + 'brvbar': 0x00a6, # broken bar = broken vertical bar, U+00A6 ISOnum + 'bull': 0x2022, # bullet = black small circle, U+2022 ISOpub + 'cap': 0x2229, # intersection = cap, U+2229 ISOtech + 'ccedil': 0x00e7, # latin small letter c with cedilla, U+00E7 ISOlat1 + 'cedil': 0x00b8, # cedilla = spacing cedilla, U+00B8 ISOdia + 'cent': 0x00a2, # cent sign, U+00A2 ISOnum + 'chi': 0x03c7, # greek small letter chi, U+03C7 ISOgrk3 + 'circ': 0x02c6, # modifier letter circumflex accent, U+02C6 ISOpub + 'clubs': 0x2663, # black club suit = shamrock, U+2663 ISOpub + 'cong': 0x2245, # approximately equal to, U+2245 ISOtech + 'copy': 0x00a9, # copyright sign, U+00A9 ISOnum + 'crarr': 0x21b5, # downwards arrow with corner leftwards = carriage return, U+21B5 NEW + 'cup': 0x222a, # union = cup, U+222A ISOtech + 'curren': 0x00a4, # currency sign, U+00A4 ISOnum + 'dArr': 0x21d3, # downwards double arrow, U+21D3 ISOamsa + 'dagger': 0x2020, # dagger, U+2020 ISOpub + 'darr': 0x2193, # downwards arrow, U+2193 ISOnum + 'deg': 0x00b0, # degree sign, U+00B0 ISOnum + 'delta': 0x03b4, # greek small letter delta, U+03B4 ISOgrk3 + 'diams': 0x2666, # black diamond suit, U+2666 ISOpub + 'divide': 0x00f7, # division sign, U+00F7 ISOnum + 'eacute': 0x00e9, # latin small letter e with acute, U+00E9 ISOlat1 + 'ecirc': 0x00ea, # latin small letter e with circumflex, U+00EA ISOlat1 + 'egrave': 0x00e8, # latin small letter e with grave, U+00E8 ISOlat1 + 'empty': 0x2205, # empty set = null set = diameter, U+2205 ISOamso + 'emsp': 0x2003, # em space, U+2003 ISOpub + 'ensp': 0x2002, # en space, U+2002 ISOpub + 'epsilon': 0x03b5, # greek small letter epsilon, U+03B5 ISOgrk3 + 'equiv': 0x2261, # identical to, U+2261 ISOtech + 'eta': 0x03b7, # greek small letter eta, U+03B7 ISOgrk3 + 'eth': 0x00f0, # latin small letter eth, U+00F0 ISOlat1 + 'euml': 0x00eb, # latin small letter e with diaeresis, U+00EB ISOlat1 + 'euro': 0x20ac, # euro sign, U+20AC NEW + 'exist': 0x2203, # there exists, U+2203 ISOtech + 'fnof': 0x0192, # latin small f with hook = function = florin, U+0192 ISOtech + 'forall': 0x2200, # for all, U+2200 ISOtech + 'frac12': 0x00bd, # vulgar fraction one half = fraction one half, U+00BD ISOnum + 'frac14': 0x00bc, # vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum + 'frac34': 0x00be, # vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum + 'frasl': 0x2044, # fraction slash, U+2044 NEW + 'gamma': 0x03b3, # greek small letter gamma, U+03B3 ISOgrk3 + 'ge': 0x2265, # greater-than or equal to, U+2265 ISOtech + 'gt': 0x003e, # greater-than sign, U+003E ISOnum + 'hArr': 0x21d4, # left right double arrow, U+21D4 ISOamsa + 'harr': 0x2194, # left right arrow, U+2194 ISOamsa + 'hearts': 0x2665, # black heart suit = valentine, U+2665 ISOpub + 'hellip': 0x2026, # horizontal ellipsis = three dot leader, U+2026 ISOpub + 'iacute': 0x00ed, # latin small letter i with acute, U+00ED ISOlat1 + 'icirc': 0x00ee, # latin small letter i with circumflex, U+00EE ISOlat1 + 'iexcl': 0x00a1, # inverted exclamation mark, U+00A1 ISOnum + 'igrave': 0x00ec, # latin small letter i with grave, U+00EC ISOlat1 + 'image': 0x2111, # blackletter capital I = imaginary part, U+2111 ISOamso + 'infin': 0x221e, # infinity, U+221E ISOtech + 'int': 0x222b, # integral, U+222B ISOtech + 'iota': 0x03b9, # greek small letter iota, U+03B9 ISOgrk3 + 'iquest': 0x00bf, # inverted question mark = turned question mark, U+00BF ISOnum + 'isin': 0x2208, # element of, U+2208 ISOtech + 'iuml': 0x00ef, # latin small letter i with diaeresis, U+00EF ISOlat1 + 'kappa': 0x03ba, # greek small letter kappa, U+03BA ISOgrk3 + 'lArr': 0x21d0, # leftwards double arrow, U+21D0 ISOtech + 'lambda': 0x03bb, # greek small letter lambda, U+03BB ISOgrk3 + 'lang': 0x2329, # left-pointing angle bracket = bra, U+2329 ISOtech + 'laquo': 0x00ab, # left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum + 'larr': 0x2190, # leftwards arrow, U+2190 ISOnum + 'lceil': 0x2308, # left ceiling = apl upstile, U+2308 ISOamsc + 'ldquo': 0x201c, # left double quotation mark, U+201C ISOnum + 'le': 0x2264, # less-than or equal to, U+2264 ISOtech + 'lfloor': 0x230a, # left floor = apl downstile, U+230A ISOamsc + 'lowast': 0x2217, # asterisk operator, U+2217 ISOtech + 'loz': 0x25ca, # lozenge, U+25CA ISOpub + 'lrm': 0x200e, # left-to-right mark, U+200E NEW RFC 2070 + 'lsaquo': 0x2039, # single left-pointing angle quotation mark, U+2039 ISO proposed + 'lsquo': 0x2018, # left single quotation mark, U+2018 ISOnum + 'lt': 0x003c, # less-than sign, U+003C ISOnum + 'macr': 0x00af, # macron = spacing macron = overline = APL overbar, U+00AF ISOdia + 'mdash': 0x2014, # em dash, U+2014 ISOpub + 'micro': 0x00b5, # micro sign, U+00B5 ISOnum + 'middot': 0x00b7, # middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum + 'minus': 0x2212, # minus sign, U+2212 ISOtech + 'mu': 0x03bc, # greek small letter mu, U+03BC ISOgrk3 + 'nabla': 0x2207, # nabla = backward difference, U+2207 ISOtech + 'nbsp': 0x00a0, # no-break space = non-breaking space, U+00A0 ISOnum + 'ndash': 0x2013, # en dash, U+2013 ISOpub + 'ne': 0x2260, # not equal to, U+2260 ISOtech + 'ni': 0x220b, # contains as member, U+220B ISOtech + 'not': 0x00ac, # not sign, U+00AC ISOnum + 'notin': 0x2209, # not an element of, U+2209 ISOtech + 'nsub': 0x2284, # not a subset of, U+2284 ISOamsn + 'ntilde': 0x00f1, # latin small letter n with tilde, U+00F1 ISOlat1 + 'nu': 0x03bd, # greek small letter nu, U+03BD ISOgrk3 + 'oacute': 0x00f3, # latin small letter o with acute, U+00F3 ISOlat1 + 'ocirc': 0x00f4, # latin small letter o with circumflex, U+00F4 ISOlat1 + 'oelig': 0x0153, # latin small ligature oe, U+0153 ISOlat2 + 'ograve': 0x00f2, # latin small letter o with grave, U+00F2 ISOlat1 + 'oline': 0x203e, # overline = spacing overscore, U+203E NEW + 'omega': 0x03c9, # greek small letter omega, U+03C9 ISOgrk3 + 'omicron': 0x03bf, # greek small letter omicron, U+03BF NEW + 'oplus': 0x2295, # circled plus = direct sum, U+2295 ISOamsb + 'or': 0x2228, # logical or = vee, U+2228 ISOtech + 'ordf': 0x00aa, # feminine ordinal indicator, U+00AA ISOnum + 'ordm': 0x00ba, # masculine ordinal indicator, U+00BA ISOnum + 'oslash': 0x00f8, # latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1 + 'otilde': 0x00f5, # latin small letter o with tilde, U+00F5 ISOlat1 + 'otimes': 0x2297, # circled times = vector product, U+2297 ISOamsb + 'ouml': 0x00f6, # latin small letter o with diaeresis, U+00F6 ISOlat1 + 'para': 0x00b6, # pilcrow sign = paragraph sign, U+00B6 ISOnum + 'part': 0x2202, # partial differential, U+2202 ISOtech + 'permil': 0x2030, # per mille sign, U+2030 ISOtech + 'perp': 0x22a5, # up tack = orthogonal to = perpendicular, U+22A5 ISOtech + 'phi': 0x03c6, # greek small letter phi, U+03C6 ISOgrk3 + 'pi': 0x03c0, # greek small letter pi, U+03C0 ISOgrk3 + 'piv': 0x03d6, # greek pi symbol, U+03D6 ISOgrk3 + 'plusmn': 0x00b1, # plus-minus sign = plus-or-minus sign, U+00B1 ISOnum + 'pound': 0x00a3, # pound sign, U+00A3 ISOnum + 'prime': 0x2032, # prime = minutes = feet, U+2032 ISOtech + 'prod': 0x220f, # n-ary product = product sign, U+220F ISOamsb + 'prop': 0x221d, # proportional to, U+221D ISOtech + 'psi': 0x03c8, # greek small letter psi, U+03C8 ISOgrk3 + 'quot': 0x0022, # quotation mark = APL quote, U+0022 ISOnum + 'rArr': 0x21d2, # rightwards double arrow, U+21D2 ISOtech + 'radic': 0x221a, # square root = radical sign, U+221A ISOtech + 'rang': 0x232a, # right-pointing angle bracket = ket, U+232A ISOtech + 'raquo': 0x00bb, # right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum + 'rarr': 0x2192, # rightwards arrow, U+2192 ISOnum + 'rceil': 0x2309, # right ceiling, U+2309 ISOamsc + 'rdquo': 0x201d, # right double quotation mark, U+201D ISOnum + 'real': 0x211c, # blackletter capital R = real part symbol, U+211C ISOamso + 'reg': 0x00ae, # registered sign = registered trade mark sign, U+00AE ISOnum + 'rfloor': 0x230b, # right floor, U+230B ISOamsc + 'rho': 0x03c1, # greek small letter rho, U+03C1 ISOgrk3 + 'rlm': 0x200f, # right-to-left mark, U+200F NEW RFC 2070 + 'rsaquo': 0x203a, # single right-pointing angle quotation mark, U+203A ISO proposed + 'rsquo': 0x2019, # right single quotation mark, U+2019 ISOnum + 'sbquo': 0x201a, # single low-9 quotation mark, U+201A NEW + 'scaron': 0x0161, # latin small letter s with caron, U+0161 ISOlat2 + 'sdot': 0x22c5, # dot operator, U+22C5 ISOamsb + 'sect': 0x00a7, # section sign, U+00A7 ISOnum + 'shy': 0x00ad, # soft hyphen = discretionary hyphen, U+00AD ISOnum + 'sigma': 0x03c3, # greek small letter sigma, U+03C3 ISOgrk3 + 'sigmaf': 0x03c2, # greek small letter final sigma, U+03C2 ISOgrk3 + 'sim': 0x223c, # tilde operator = varies with = similar to, U+223C ISOtech + 'spades': 0x2660, # black spade suit, U+2660 ISOpub + 'sub': 0x2282, # subset of, U+2282 ISOtech + 'sube': 0x2286, # subset of or equal to, U+2286 ISOtech + 'sum': 0x2211, # n-ary summation, U+2211 ISOamsb + 'sup': 0x2283, # superset of, U+2283 ISOtech + 'sup1': 0x00b9, # superscript one = superscript digit one, U+00B9 ISOnum + 'sup2': 0x00b2, # superscript two = superscript digit two = squared, U+00B2 ISOnum + 'sup3': 0x00b3, # superscript three = superscript digit three = cubed, U+00B3 ISOnum + 'supe': 0x2287, # superset of or equal to, U+2287 ISOtech + 'szlig': 0x00df, # latin small letter sharp s = ess-zed, U+00DF ISOlat1 + 'tau': 0x03c4, # greek small letter tau, U+03C4 ISOgrk3 + 'there4': 0x2234, # therefore, U+2234 ISOtech + 'theta': 0x03b8, # greek small letter theta, U+03B8 ISOgrk3 + 'thetasym': 0x03d1, # greek small letter theta symbol, U+03D1 NEW + 'thinsp': 0x2009, # thin space, U+2009 ISOpub + 'thorn': 0x00fe, # latin small letter thorn with, U+00FE ISOlat1 + 'tilde': 0x02dc, # small tilde, U+02DC ISOdia + 'times': 0x00d7, # multiplication sign, U+00D7 ISOnum + 'trade': 0x2122, # trade mark sign, U+2122 ISOnum + 'uArr': 0x21d1, # upwards double arrow, U+21D1 ISOamsa + 'uacute': 0x00fa, # latin small letter u with acute, U+00FA ISOlat1 + 'uarr': 0x2191, # upwards arrow, U+2191 ISOnum + 'ucirc': 0x00fb, # latin small letter u with circumflex, U+00FB ISOlat1 + 'ugrave': 0x00f9, # latin small letter u with grave, U+00F9 ISOlat1 + 'uml': 0x00a8, # diaeresis = spacing diaeresis, U+00A8 ISOdia + 'upsih': 0x03d2, # greek upsilon with hook symbol, U+03D2 NEW + 'upsilon': 0x03c5, # greek small letter upsilon, U+03C5 ISOgrk3 + 'uuml': 0x00fc, # latin small letter u with diaeresis, U+00FC ISOlat1 + 'weierp': 0x2118, # script capital P = power set = Weierstrass p, U+2118 ISOamso + 'xi': 0x03be, # greek small letter xi, U+03BE ISOgrk3 + 'yacute': 0x00fd, # latin small letter y with acute, U+00FD ISOlat1 + 'yen': 0x00a5, # yen sign = yuan sign, U+00A5 ISOnum + 'yuml': 0x00ff, # latin small letter y with diaeresis, U+00FF ISOlat1 + 'zeta': 0x03b6, # greek small letter zeta, U+03B6 ISOgrk3 + 'zwj': 0x200d, # zero width joiner, U+200D NEW RFC 2070 + 'zwnj': 0x200c, # zero width non-joiner, U+200C NEW RFC 2070 +} + + +# maps the HTML5 named character references to the equivalent Unicode character(s) +html5 = { + 'Aacute': '\xc1', + 'aacute': '\xe1', + 'Aacute;': '\xc1', + 'aacute;': '\xe1', + 'Abreve;': '\u0102', + 'abreve;': '\u0103', + 'ac;': '\u223e', + 'acd;': '\u223f', + 'acE;': '\u223e\u0333', + 'Acirc': '\xc2', + 'acirc': '\xe2', + 'Acirc;': '\xc2', + 'acirc;': '\xe2', + 'acute': '\xb4', + 'acute;': '\xb4', + 'Acy;': '\u0410', + 'acy;': '\u0430', + 'AElig': '\xc6', + 'aelig': '\xe6', + 'AElig;': '\xc6', + 'aelig;': '\xe6', + 'af;': '\u2061', + 'Afr;': '\U0001d504', + 'afr;': '\U0001d51e', + 'Agrave': '\xc0', + 'agrave': '\xe0', + 'Agrave;': '\xc0', + 'agrave;': '\xe0', + 'alefsym;': '\u2135', + 'aleph;': '\u2135', + 'Alpha;': '\u0391', + 'alpha;': '\u03b1', + 'Amacr;': '\u0100', + 'amacr;': '\u0101', + 'amalg;': '\u2a3f', + 'AMP': '&', + 'amp': '&', + 'AMP;': '&', + 'amp;': '&', + 'And;': '\u2a53', + 'and;': '\u2227', + 'andand;': '\u2a55', + 'andd;': '\u2a5c', + 'andslope;': '\u2a58', + 'andv;': '\u2a5a', + 'ang;': '\u2220', + 'ange;': '\u29a4', + 'angle;': '\u2220', + 'angmsd;': '\u2221', + 'angmsdaa;': '\u29a8', + 'angmsdab;': '\u29a9', + 'angmsdac;': '\u29aa', + 'angmsdad;': '\u29ab', + 'angmsdae;': '\u29ac', + 'angmsdaf;': '\u29ad', + 'angmsdag;': '\u29ae', + 'angmsdah;': '\u29af', + 'angrt;': '\u221f', + 'angrtvb;': '\u22be', + 'angrtvbd;': '\u299d', + 'angsph;': '\u2222', + 'angst;': '\xc5', + 'angzarr;': '\u237c', + 'Aogon;': '\u0104', + 'aogon;': '\u0105', + 'Aopf;': '\U0001d538', + 'aopf;': '\U0001d552', + 'ap;': '\u2248', + 'apacir;': '\u2a6f', + 'apE;': '\u2a70', + 'ape;': '\u224a', + 'apid;': '\u224b', + 'apos;': "'", + 'ApplyFunction;': '\u2061', + 'approx;': '\u2248', + 'approxeq;': '\u224a', + 'Aring': '\xc5', + 'aring': '\xe5', + 'Aring;': '\xc5', + 'aring;': '\xe5', + 'Ascr;': '\U0001d49c', + 'ascr;': '\U0001d4b6', + 'Assign;': '\u2254', + 'ast;': '*', + 'asymp;': '\u2248', + 'asympeq;': '\u224d', + 'Atilde': '\xc3', + 'atilde': '\xe3', + 'Atilde;': '\xc3', + 'atilde;': '\xe3', + 'Auml': '\xc4', + 'auml': '\xe4', + 'Auml;': '\xc4', + 'auml;': '\xe4', + 'awconint;': '\u2233', + 'awint;': '\u2a11', + 'backcong;': '\u224c', + 'backepsilon;': '\u03f6', + 'backprime;': '\u2035', + 'backsim;': '\u223d', + 'backsimeq;': '\u22cd', + 'Backslash;': '\u2216', + 'Barv;': '\u2ae7', + 'barvee;': '\u22bd', + 'Barwed;': '\u2306', + 'barwed;': '\u2305', + 'barwedge;': '\u2305', + 'bbrk;': '\u23b5', + 'bbrktbrk;': '\u23b6', + 'bcong;': '\u224c', + 'Bcy;': '\u0411', + 'bcy;': '\u0431', + 'bdquo;': '\u201e', + 'becaus;': '\u2235', + 'Because;': '\u2235', + 'because;': '\u2235', + 'bemptyv;': '\u29b0', + 'bepsi;': '\u03f6', + 'bernou;': '\u212c', + 'Bernoullis;': '\u212c', + 'Beta;': '\u0392', + 'beta;': '\u03b2', + 'beth;': '\u2136', + 'between;': '\u226c', + 'Bfr;': '\U0001d505', + 'bfr;': '\U0001d51f', + 'bigcap;': '\u22c2', + 'bigcirc;': '\u25ef', + 'bigcup;': '\u22c3', + 'bigodot;': '\u2a00', + 'bigoplus;': '\u2a01', + 'bigotimes;': '\u2a02', + 'bigsqcup;': '\u2a06', + 'bigstar;': '\u2605', + 'bigtriangledown;': '\u25bd', + 'bigtriangleup;': '\u25b3', + 'biguplus;': '\u2a04', + 'bigvee;': '\u22c1', + 'bigwedge;': '\u22c0', + 'bkarow;': '\u290d', + 'blacklozenge;': '\u29eb', + 'blacksquare;': '\u25aa', + 'blacktriangle;': '\u25b4', + 'blacktriangledown;': '\u25be', + 'blacktriangleleft;': '\u25c2', + 'blacktriangleright;': '\u25b8', + 'blank;': '\u2423', + 'blk12;': '\u2592', + 'blk14;': '\u2591', + 'blk34;': '\u2593', + 'block;': '\u2588', + 'bne;': '=\u20e5', + 'bnequiv;': '\u2261\u20e5', + 'bNot;': '\u2aed', + 'bnot;': '\u2310', + 'Bopf;': '\U0001d539', + 'bopf;': '\U0001d553', + 'bot;': '\u22a5', + 'bottom;': '\u22a5', + 'bowtie;': '\u22c8', + 'boxbox;': '\u29c9', + 'boxDL;': '\u2557', + 'boxDl;': '\u2556', + 'boxdL;': '\u2555', + 'boxdl;': '\u2510', + 'boxDR;': '\u2554', + 'boxDr;': '\u2553', + 'boxdR;': '\u2552', + 'boxdr;': '\u250c', + 'boxH;': '\u2550', + 'boxh;': '\u2500', + 'boxHD;': '\u2566', + 'boxHd;': '\u2564', + 'boxhD;': '\u2565', + 'boxhd;': '\u252c', + 'boxHU;': '\u2569', + 'boxHu;': '\u2567', + 'boxhU;': '\u2568', + 'boxhu;': '\u2534', + 'boxminus;': '\u229f', + 'boxplus;': '\u229e', + 'boxtimes;': '\u22a0', + 'boxUL;': '\u255d', + 'boxUl;': '\u255c', + 'boxuL;': '\u255b', + 'boxul;': '\u2518', + 'boxUR;': '\u255a', + 'boxUr;': '\u2559', + 'boxuR;': '\u2558', + 'boxur;': '\u2514', + 'boxV;': '\u2551', + 'boxv;': '\u2502', + 'boxVH;': '\u256c', + 'boxVh;': '\u256b', + 'boxvH;': '\u256a', + 'boxvh;': '\u253c', + 'boxVL;': '\u2563', + 'boxVl;': '\u2562', + 'boxvL;': '\u2561', + 'boxvl;': '\u2524', + 'boxVR;': '\u2560', + 'boxVr;': '\u255f', + 'boxvR;': '\u255e', + 'boxvr;': '\u251c', + 'bprime;': '\u2035', + 'Breve;': '\u02d8', + 'breve;': '\u02d8', + 'brvbar': '\xa6', + 'brvbar;': '\xa6', + 'Bscr;': '\u212c', + 'bscr;': '\U0001d4b7', + 'bsemi;': '\u204f', + 'bsim;': '\u223d', + 'bsime;': '\u22cd', + 'bsol;': '\\', + 'bsolb;': '\u29c5', + 'bsolhsub;': '\u27c8', + 'bull;': '\u2022', + 'bullet;': '\u2022', + 'bump;': '\u224e', + 'bumpE;': '\u2aae', + 'bumpe;': '\u224f', + 'Bumpeq;': '\u224e', + 'bumpeq;': '\u224f', + 'Cacute;': '\u0106', + 'cacute;': '\u0107', + 'Cap;': '\u22d2', + 'cap;': '\u2229', + 'capand;': '\u2a44', + 'capbrcup;': '\u2a49', + 'capcap;': '\u2a4b', + 'capcup;': '\u2a47', + 'capdot;': '\u2a40', + 'CapitalDifferentialD;': '\u2145', + 'caps;': '\u2229\ufe00', + 'caret;': '\u2041', + 'caron;': '\u02c7', + 'Cayleys;': '\u212d', + 'ccaps;': '\u2a4d', + 'Ccaron;': '\u010c', + 'ccaron;': '\u010d', + 'Ccedil': '\xc7', + 'ccedil': '\xe7', + 'Ccedil;': '\xc7', + 'ccedil;': '\xe7', + 'Ccirc;': '\u0108', + 'ccirc;': '\u0109', + 'Cconint;': '\u2230', + 'ccups;': '\u2a4c', + 'ccupssm;': '\u2a50', + 'Cdot;': '\u010a', + 'cdot;': '\u010b', + 'cedil': '\xb8', + 'cedil;': '\xb8', + 'Cedilla;': '\xb8', + 'cemptyv;': '\u29b2', + 'cent': '\xa2', + 'cent;': '\xa2', + 'CenterDot;': '\xb7', + 'centerdot;': '\xb7', + 'Cfr;': '\u212d', + 'cfr;': '\U0001d520', + 'CHcy;': '\u0427', + 'chcy;': '\u0447', + 'check;': '\u2713', + 'checkmark;': '\u2713', + 'Chi;': '\u03a7', + 'chi;': '\u03c7', + 'cir;': '\u25cb', + 'circ;': '\u02c6', + 'circeq;': '\u2257', + 'circlearrowleft;': '\u21ba', + 'circlearrowright;': '\u21bb', + 'circledast;': '\u229b', + 'circledcirc;': '\u229a', + 'circleddash;': '\u229d', + 'CircleDot;': '\u2299', + 'circledR;': '\xae', + 'circledS;': '\u24c8', + 'CircleMinus;': '\u2296', + 'CirclePlus;': '\u2295', + 'CircleTimes;': '\u2297', + 'cirE;': '\u29c3', + 'cire;': '\u2257', + 'cirfnint;': '\u2a10', + 'cirmid;': '\u2aef', + 'cirscir;': '\u29c2', + 'ClockwiseContourIntegral;': '\u2232', + 'CloseCurlyDoubleQuote;': '\u201d', + 'CloseCurlyQuote;': '\u2019', + 'clubs;': '\u2663', + 'clubsuit;': '\u2663', + 'Colon;': '\u2237', + 'colon;': ':', + 'Colone;': '\u2a74', + 'colone;': '\u2254', + 'coloneq;': '\u2254', + 'comma;': ',', + 'commat;': '@', + 'comp;': '\u2201', + 'compfn;': '\u2218', + 'complement;': '\u2201', + 'complexes;': '\u2102', + 'cong;': '\u2245', + 'congdot;': '\u2a6d', + 'Congruent;': '\u2261', + 'Conint;': '\u222f', + 'conint;': '\u222e', + 'ContourIntegral;': '\u222e', + 'Copf;': '\u2102', + 'copf;': '\U0001d554', + 'coprod;': '\u2210', + 'Coproduct;': '\u2210', + 'COPY': '\xa9', + 'copy': '\xa9', + 'COPY;': '\xa9', + 'copy;': '\xa9', + 'copysr;': '\u2117', + 'CounterClockwiseContourIntegral;': '\u2233', + 'crarr;': '\u21b5', + 'Cross;': '\u2a2f', + 'cross;': '\u2717', + 'Cscr;': '\U0001d49e', + 'cscr;': '\U0001d4b8', + 'csub;': '\u2acf', + 'csube;': '\u2ad1', + 'csup;': '\u2ad0', + 'csupe;': '\u2ad2', + 'ctdot;': '\u22ef', + 'cudarrl;': '\u2938', + 'cudarrr;': '\u2935', + 'cuepr;': '\u22de', + 'cuesc;': '\u22df', + 'cularr;': '\u21b6', + 'cularrp;': '\u293d', + 'Cup;': '\u22d3', + 'cup;': '\u222a', + 'cupbrcap;': '\u2a48', + 'CupCap;': '\u224d', + 'cupcap;': '\u2a46', + 'cupcup;': '\u2a4a', + 'cupdot;': '\u228d', + 'cupor;': '\u2a45', + 'cups;': '\u222a\ufe00', + 'curarr;': '\u21b7', + 'curarrm;': '\u293c', + 'curlyeqprec;': '\u22de', + 'curlyeqsucc;': '\u22df', + 'curlyvee;': '\u22ce', + 'curlywedge;': '\u22cf', + 'curren': '\xa4', + 'curren;': '\xa4', + 'curvearrowleft;': '\u21b6', + 'curvearrowright;': '\u21b7', + 'cuvee;': '\u22ce', + 'cuwed;': '\u22cf', + 'cwconint;': '\u2232', + 'cwint;': '\u2231', + 'cylcty;': '\u232d', + 'Dagger;': '\u2021', + 'dagger;': '\u2020', + 'daleth;': '\u2138', + 'Darr;': '\u21a1', + 'dArr;': '\u21d3', + 'darr;': '\u2193', + 'dash;': '\u2010', + 'Dashv;': '\u2ae4', + 'dashv;': '\u22a3', + 'dbkarow;': '\u290f', + 'dblac;': '\u02dd', + 'Dcaron;': '\u010e', + 'dcaron;': '\u010f', + 'Dcy;': '\u0414', + 'dcy;': '\u0434', + 'DD;': '\u2145', + 'dd;': '\u2146', + 'ddagger;': '\u2021', + 'ddarr;': '\u21ca', + 'DDotrahd;': '\u2911', + 'ddotseq;': '\u2a77', + 'deg': '\xb0', + 'deg;': '\xb0', + 'Del;': '\u2207', + 'Delta;': '\u0394', + 'delta;': '\u03b4', + 'demptyv;': '\u29b1', + 'dfisht;': '\u297f', + 'Dfr;': '\U0001d507', + 'dfr;': '\U0001d521', + 'dHar;': '\u2965', + 'dharl;': '\u21c3', + 'dharr;': '\u21c2', + 'DiacriticalAcute;': '\xb4', + 'DiacriticalDot;': '\u02d9', + 'DiacriticalDoubleAcute;': '\u02dd', + 'DiacriticalGrave;': '`', + 'DiacriticalTilde;': '\u02dc', + 'diam;': '\u22c4', + 'Diamond;': '\u22c4', + 'diamond;': '\u22c4', + 'diamondsuit;': '\u2666', + 'diams;': '\u2666', + 'die;': '\xa8', + 'DifferentialD;': '\u2146', + 'digamma;': '\u03dd', + 'disin;': '\u22f2', + 'div;': '\xf7', + 'divide': '\xf7', + 'divide;': '\xf7', + 'divideontimes;': '\u22c7', + 'divonx;': '\u22c7', + 'DJcy;': '\u0402', + 'djcy;': '\u0452', + 'dlcorn;': '\u231e', + 'dlcrop;': '\u230d', + 'dollar;': '$', + 'Dopf;': '\U0001d53b', + 'dopf;': '\U0001d555', + 'Dot;': '\xa8', + 'dot;': '\u02d9', + 'DotDot;': '\u20dc', + 'doteq;': '\u2250', + 'doteqdot;': '\u2251', + 'DotEqual;': '\u2250', + 'dotminus;': '\u2238', + 'dotplus;': '\u2214', + 'dotsquare;': '\u22a1', + 'doublebarwedge;': '\u2306', + 'DoubleContourIntegral;': '\u222f', + 'DoubleDot;': '\xa8', + 'DoubleDownArrow;': '\u21d3', + 'DoubleLeftArrow;': '\u21d0', + 'DoubleLeftRightArrow;': '\u21d4', + 'DoubleLeftTee;': '\u2ae4', + 'DoubleLongLeftArrow;': '\u27f8', + 'DoubleLongLeftRightArrow;': '\u27fa', + 'DoubleLongRightArrow;': '\u27f9', + 'DoubleRightArrow;': '\u21d2', + 'DoubleRightTee;': '\u22a8', + 'DoubleUpArrow;': '\u21d1', + 'DoubleUpDownArrow;': '\u21d5', + 'DoubleVerticalBar;': '\u2225', + 'DownArrow;': '\u2193', + 'Downarrow;': '\u21d3', + 'downarrow;': '\u2193', + 'DownArrowBar;': '\u2913', + 'DownArrowUpArrow;': '\u21f5', + 'DownBreve;': '\u0311', + 'downdownarrows;': '\u21ca', + 'downharpoonleft;': '\u21c3', + 'downharpoonright;': '\u21c2', + 'DownLeftRightVector;': '\u2950', + 'DownLeftTeeVector;': '\u295e', + 'DownLeftVector;': '\u21bd', + 'DownLeftVectorBar;': '\u2956', + 'DownRightTeeVector;': '\u295f', + 'DownRightVector;': '\u21c1', + 'DownRightVectorBar;': '\u2957', + 'DownTee;': '\u22a4', + 'DownTeeArrow;': '\u21a7', + 'drbkarow;': '\u2910', + 'drcorn;': '\u231f', + 'drcrop;': '\u230c', + 'Dscr;': '\U0001d49f', + 'dscr;': '\U0001d4b9', + 'DScy;': '\u0405', + 'dscy;': '\u0455', + 'dsol;': '\u29f6', + 'Dstrok;': '\u0110', + 'dstrok;': '\u0111', + 'dtdot;': '\u22f1', + 'dtri;': '\u25bf', + 'dtrif;': '\u25be', + 'duarr;': '\u21f5', + 'duhar;': '\u296f', + 'dwangle;': '\u29a6', + 'DZcy;': '\u040f', + 'dzcy;': '\u045f', + 'dzigrarr;': '\u27ff', + 'Eacute': '\xc9', + 'eacute': '\xe9', + 'Eacute;': '\xc9', + 'eacute;': '\xe9', + 'easter;': '\u2a6e', + 'Ecaron;': '\u011a', + 'ecaron;': '\u011b', + 'ecir;': '\u2256', + 'Ecirc': '\xca', + 'ecirc': '\xea', + 'Ecirc;': '\xca', + 'ecirc;': '\xea', + 'ecolon;': '\u2255', + 'Ecy;': '\u042d', + 'ecy;': '\u044d', + 'eDDot;': '\u2a77', + 'Edot;': '\u0116', + 'eDot;': '\u2251', + 'edot;': '\u0117', + 'ee;': '\u2147', + 'efDot;': '\u2252', + 'Efr;': '\U0001d508', + 'efr;': '\U0001d522', + 'eg;': '\u2a9a', + 'Egrave': '\xc8', + 'egrave': '\xe8', + 'Egrave;': '\xc8', + 'egrave;': '\xe8', + 'egs;': '\u2a96', + 'egsdot;': '\u2a98', + 'el;': '\u2a99', + 'Element;': '\u2208', + 'elinters;': '\u23e7', + 'ell;': '\u2113', + 'els;': '\u2a95', + 'elsdot;': '\u2a97', + 'Emacr;': '\u0112', + 'emacr;': '\u0113', + 'empty;': '\u2205', + 'emptyset;': '\u2205', + 'EmptySmallSquare;': '\u25fb', + 'emptyv;': '\u2205', + 'EmptyVerySmallSquare;': '\u25ab', + 'emsp13;': '\u2004', + 'emsp14;': '\u2005', + 'emsp;': '\u2003', + 'ENG;': '\u014a', + 'eng;': '\u014b', + 'ensp;': '\u2002', + 'Eogon;': '\u0118', + 'eogon;': '\u0119', + 'Eopf;': '\U0001d53c', + 'eopf;': '\U0001d556', + 'epar;': '\u22d5', + 'eparsl;': '\u29e3', + 'eplus;': '\u2a71', + 'epsi;': '\u03b5', + 'Epsilon;': '\u0395', + 'epsilon;': '\u03b5', + 'epsiv;': '\u03f5', + 'eqcirc;': '\u2256', + 'eqcolon;': '\u2255', + 'eqsim;': '\u2242', + 'eqslantgtr;': '\u2a96', + 'eqslantless;': '\u2a95', + 'Equal;': '\u2a75', + 'equals;': '=', + 'EqualTilde;': '\u2242', + 'equest;': '\u225f', + 'Equilibrium;': '\u21cc', + 'equiv;': '\u2261', + 'equivDD;': '\u2a78', + 'eqvparsl;': '\u29e5', + 'erarr;': '\u2971', + 'erDot;': '\u2253', + 'Escr;': '\u2130', + 'escr;': '\u212f', + 'esdot;': '\u2250', + 'Esim;': '\u2a73', + 'esim;': '\u2242', + 'Eta;': '\u0397', + 'eta;': '\u03b7', + 'ETH': '\xd0', + 'eth': '\xf0', + 'ETH;': '\xd0', + 'eth;': '\xf0', + 'Euml': '\xcb', + 'euml': '\xeb', + 'Euml;': '\xcb', + 'euml;': '\xeb', + 'euro;': '\u20ac', + 'excl;': '!', + 'exist;': '\u2203', + 'Exists;': '\u2203', + 'expectation;': '\u2130', + 'ExponentialE;': '\u2147', + 'exponentiale;': '\u2147', + 'fallingdotseq;': '\u2252', + 'Fcy;': '\u0424', + 'fcy;': '\u0444', + 'female;': '\u2640', + 'ffilig;': '\ufb03', + 'fflig;': '\ufb00', + 'ffllig;': '\ufb04', + 'Ffr;': '\U0001d509', + 'ffr;': '\U0001d523', + 'filig;': '\ufb01', + 'FilledSmallSquare;': '\u25fc', + 'FilledVerySmallSquare;': '\u25aa', + 'fjlig;': 'fj', + 'flat;': '\u266d', + 'fllig;': '\ufb02', + 'fltns;': '\u25b1', + 'fnof;': '\u0192', + 'Fopf;': '\U0001d53d', + 'fopf;': '\U0001d557', + 'ForAll;': '\u2200', + 'forall;': '\u2200', + 'fork;': '\u22d4', + 'forkv;': '\u2ad9', + 'Fouriertrf;': '\u2131', + 'fpartint;': '\u2a0d', + 'frac12': '\xbd', + 'frac12;': '\xbd', + 'frac13;': '\u2153', + 'frac14': '\xbc', + 'frac14;': '\xbc', + 'frac15;': '\u2155', + 'frac16;': '\u2159', + 'frac18;': '\u215b', + 'frac23;': '\u2154', + 'frac25;': '\u2156', + 'frac34': '\xbe', + 'frac34;': '\xbe', + 'frac35;': '\u2157', + 'frac38;': '\u215c', + 'frac45;': '\u2158', + 'frac56;': '\u215a', + 'frac58;': '\u215d', + 'frac78;': '\u215e', + 'frasl;': '\u2044', + 'frown;': '\u2322', + 'Fscr;': '\u2131', + 'fscr;': '\U0001d4bb', + 'gacute;': '\u01f5', + 'Gamma;': '\u0393', + 'gamma;': '\u03b3', + 'Gammad;': '\u03dc', + 'gammad;': '\u03dd', + 'gap;': '\u2a86', + 'Gbreve;': '\u011e', + 'gbreve;': '\u011f', + 'Gcedil;': '\u0122', + 'Gcirc;': '\u011c', + 'gcirc;': '\u011d', + 'Gcy;': '\u0413', + 'gcy;': '\u0433', + 'Gdot;': '\u0120', + 'gdot;': '\u0121', + 'gE;': '\u2267', + 'ge;': '\u2265', + 'gEl;': '\u2a8c', + 'gel;': '\u22db', + 'geq;': '\u2265', + 'geqq;': '\u2267', + 'geqslant;': '\u2a7e', + 'ges;': '\u2a7e', + 'gescc;': '\u2aa9', + 'gesdot;': '\u2a80', + 'gesdoto;': '\u2a82', + 'gesdotol;': '\u2a84', + 'gesl;': '\u22db\ufe00', + 'gesles;': '\u2a94', + 'Gfr;': '\U0001d50a', + 'gfr;': '\U0001d524', + 'Gg;': '\u22d9', + 'gg;': '\u226b', + 'ggg;': '\u22d9', + 'gimel;': '\u2137', + 'GJcy;': '\u0403', + 'gjcy;': '\u0453', + 'gl;': '\u2277', + 'gla;': '\u2aa5', + 'glE;': '\u2a92', + 'glj;': '\u2aa4', + 'gnap;': '\u2a8a', + 'gnapprox;': '\u2a8a', + 'gnE;': '\u2269', + 'gne;': '\u2a88', + 'gneq;': '\u2a88', + 'gneqq;': '\u2269', + 'gnsim;': '\u22e7', + 'Gopf;': '\U0001d53e', + 'gopf;': '\U0001d558', + 'grave;': '`', + 'GreaterEqual;': '\u2265', + 'GreaterEqualLess;': '\u22db', + 'GreaterFullEqual;': '\u2267', + 'GreaterGreater;': '\u2aa2', + 'GreaterLess;': '\u2277', + 'GreaterSlantEqual;': '\u2a7e', + 'GreaterTilde;': '\u2273', + 'Gscr;': '\U0001d4a2', + 'gscr;': '\u210a', + 'gsim;': '\u2273', + 'gsime;': '\u2a8e', + 'gsiml;': '\u2a90', + 'GT': '>', + 'gt': '>', + 'GT;': '>', + 'Gt;': '\u226b', + 'gt;': '>', + 'gtcc;': '\u2aa7', + 'gtcir;': '\u2a7a', + 'gtdot;': '\u22d7', + 'gtlPar;': '\u2995', + 'gtquest;': '\u2a7c', + 'gtrapprox;': '\u2a86', + 'gtrarr;': '\u2978', + 'gtrdot;': '\u22d7', + 'gtreqless;': '\u22db', + 'gtreqqless;': '\u2a8c', + 'gtrless;': '\u2277', + 'gtrsim;': '\u2273', + 'gvertneqq;': '\u2269\ufe00', + 'gvnE;': '\u2269\ufe00', + 'Hacek;': '\u02c7', + 'hairsp;': '\u200a', + 'half;': '\xbd', + 'hamilt;': '\u210b', + 'HARDcy;': '\u042a', + 'hardcy;': '\u044a', + 'hArr;': '\u21d4', + 'harr;': '\u2194', + 'harrcir;': '\u2948', + 'harrw;': '\u21ad', + 'Hat;': '^', + 'hbar;': '\u210f', + 'Hcirc;': '\u0124', + 'hcirc;': '\u0125', + 'hearts;': '\u2665', + 'heartsuit;': '\u2665', + 'hellip;': '\u2026', + 'hercon;': '\u22b9', + 'Hfr;': '\u210c', + 'hfr;': '\U0001d525', + 'HilbertSpace;': '\u210b', + 'hksearow;': '\u2925', + 'hkswarow;': '\u2926', + 'hoarr;': '\u21ff', + 'homtht;': '\u223b', + 'hookleftarrow;': '\u21a9', + 'hookrightarrow;': '\u21aa', + 'Hopf;': '\u210d', + 'hopf;': '\U0001d559', + 'horbar;': '\u2015', + 'HorizontalLine;': '\u2500', + 'Hscr;': '\u210b', + 'hscr;': '\U0001d4bd', + 'hslash;': '\u210f', + 'Hstrok;': '\u0126', + 'hstrok;': '\u0127', + 'HumpDownHump;': '\u224e', + 'HumpEqual;': '\u224f', + 'hybull;': '\u2043', + 'hyphen;': '\u2010', + 'Iacute': '\xcd', + 'iacute': '\xed', + 'Iacute;': '\xcd', + 'iacute;': '\xed', + 'ic;': '\u2063', + 'Icirc': '\xce', + 'icirc': '\xee', + 'Icirc;': '\xce', + 'icirc;': '\xee', + 'Icy;': '\u0418', + 'icy;': '\u0438', + 'Idot;': '\u0130', + 'IEcy;': '\u0415', + 'iecy;': '\u0435', + 'iexcl': '\xa1', + 'iexcl;': '\xa1', + 'iff;': '\u21d4', + 'Ifr;': '\u2111', + 'ifr;': '\U0001d526', + 'Igrave': '\xcc', + 'igrave': '\xec', + 'Igrave;': '\xcc', + 'igrave;': '\xec', + 'ii;': '\u2148', + 'iiiint;': '\u2a0c', + 'iiint;': '\u222d', + 'iinfin;': '\u29dc', + 'iiota;': '\u2129', + 'IJlig;': '\u0132', + 'ijlig;': '\u0133', + 'Im;': '\u2111', + 'Imacr;': '\u012a', + 'imacr;': '\u012b', + 'image;': '\u2111', + 'ImaginaryI;': '\u2148', + 'imagline;': '\u2110', + 'imagpart;': '\u2111', + 'imath;': '\u0131', + 'imof;': '\u22b7', + 'imped;': '\u01b5', + 'Implies;': '\u21d2', + 'in;': '\u2208', + 'incare;': '\u2105', + 'infin;': '\u221e', + 'infintie;': '\u29dd', + 'inodot;': '\u0131', + 'Int;': '\u222c', + 'int;': '\u222b', + 'intcal;': '\u22ba', + 'integers;': '\u2124', + 'Integral;': '\u222b', + 'intercal;': '\u22ba', + 'Intersection;': '\u22c2', + 'intlarhk;': '\u2a17', + 'intprod;': '\u2a3c', + 'InvisibleComma;': '\u2063', + 'InvisibleTimes;': '\u2062', + 'IOcy;': '\u0401', + 'iocy;': '\u0451', + 'Iogon;': '\u012e', + 'iogon;': '\u012f', + 'Iopf;': '\U0001d540', + 'iopf;': '\U0001d55a', + 'Iota;': '\u0399', + 'iota;': '\u03b9', + 'iprod;': '\u2a3c', + 'iquest': '\xbf', + 'iquest;': '\xbf', + 'Iscr;': '\u2110', + 'iscr;': '\U0001d4be', + 'isin;': '\u2208', + 'isindot;': '\u22f5', + 'isinE;': '\u22f9', + 'isins;': '\u22f4', + 'isinsv;': '\u22f3', + 'isinv;': '\u2208', + 'it;': '\u2062', + 'Itilde;': '\u0128', + 'itilde;': '\u0129', + 'Iukcy;': '\u0406', + 'iukcy;': '\u0456', + 'Iuml': '\xcf', + 'iuml': '\xef', + 'Iuml;': '\xcf', + 'iuml;': '\xef', + 'Jcirc;': '\u0134', + 'jcirc;': '\u0135', + 'Jcy;': '\u0419', + 'jcy;': '\u0439', + 'Jfr;': '\U0001d50d', + 'jfr;': '\U0001d527', + 'jmath;': '\u0237', + 'Jopf;': '\U0001d541', + 'jopf;': '\U0001d55b', + 'Jscr;': '\U0001d4a5', + 'jscr;': '\U0001d4bf', + 'Jsercy;': '\u0408', + 'jsercy;': '\u0458', + 'Jukcy;': '\u0404', + 'jukcy;': '\u0454', + 'Kappa;': '\u039a', + 'kappa;': '\u03ba', + 'kappav;': '\u03f0', + 'Kcedil;': '\u0136', + 'kcedil;': '\u0137', + 'Kcy;': '\u041a', + 'kcy;': '\u043a', + 'Kfr;': '\U0001d50e', + 'kfr;': '\U0001d528', + 'kgreen;': '\u0138', + 'KHcy;': '\u0425', + 'khcy;': '\u0445', + 'KJcy;': '\u040c', + 'kjcy;': '\u045c', + 'Kopf;': '\U0001d542', + 'kopf;': '\U0001d55c', + 'Kscr;': '\U0001d4a6', + 'kscr;': '\U0001d4c0', + 'lAarr;': '\u21da', + 'Lacute;': '\u0139', + 'lacute;': '\u013a', + 'laemptyv;': '\u29b4', + 'lagran;': '\u2112', + 'Lambda;': '\u039b', + 'lambda;': '\u03bb', + 'Lang;': '\u27ea', + 'lang;': '\u27e8', + 'langd;': '\u2991', + 'langle;': '\u27e8', + 'lap;': '\u2a85', + 'Laplacetrf;': '\u2112', + 'laquo': '\xab', + 'laquo;': '\xab', + 'Larr;': '\u219e', + 'lArr;': '\u21d0', + 'larr;': '\u2190', + 'larrb;': '\u21e4', + 'larrbfs;': '\u291f', + 'larrfs;': '\u291d', + 'larrhk;': '\u21a9', + 'larrlp;': '\u21ab', + 'larrpl;': '\u2939', + 'larrsim;': '\u2973', + 'larrtl;': '\u21a2', + 'lat;': '\u2aab', + 'lAtail;': '\u291b', + 'latail;': '\u2919', + 'late;': '\u2aad', + 'lates;': '\u2aad\ufe00', + 'lBarr;': '\u290e', + 'lbarr;': '\u290c', + 'lbbrk;': '\u2772', + 'lbrace;': '{', + 'lbrack;': '[', + 'lbrke;': '\u298b', + 'lbrksld;': '\u298f', + 'lbrkslu;': '\u298d', + 'Lcaron;': '\u013d', + 'lcaron;': '\u013e', + 'Lcedil;': '\u013b', + 'lcedil;': '\u013c', + 'lceil;': '\u2308', + 'lcub;': '{', + 'Lcy;': '\u041b', + 'lcy;': '\u043b', + 'ldca;': '\u2936', + 'ldquo;': '\u201c', + 'ldquor;': '\u201e', + 'ldrdhar;': '\u2967', + 'ldrushar;': '\u294b', + 'ldsh;': '\u21b2', + 'lE;': '\u2266', + 'le;': '\u2264', + 'LeftAngleBracket;': '\u27e8', + 'LeftArrow;': '\u2190', + 'Leftarrow;': '\u21d0', + 'leftarrow;': '\u2190', + 'LeftArrowBar;': '\u21e4', + 'LeftArrowRightArrow;': '\u21c6', + 'leftarrowtail;': '\u21a2', + 'LeftCeiling;': '\u2308', + 'LeftDoubleBracket;': '\u27e6', + 'LeftDownTeeVector;': '\u2961', + 'LeftDownVector;': '\u21c3', + 'LeftDownVectorBar;': '\u2959', + 'LeftFloor;': '\u230a', + 'leftharpoondown;': '\u21bd', + 'leftharpoonup;': '\u21bc', + 'leftleftarrows;': '\u21c7', + 'LeftRightArrow;': '\u2194', + 'Leftrightarrow;': '\u21d4', + 'leftrightarrow;': '\u2194', + 'leftrightarrows;': '\u21c6', + 'leftrightharpoons;': '\u21cb', + 'leftrightsquigarrow;': '\u21ad', + 'LeftRightVector;': '\u294e', + 'LeftTee;': '\u22a3', + 'LeftTeeArrow;': '\u21a4', + 'LeftTeeVector;': '\u295a', + 'leftthreetimes;': '\u22cb', + 'LeftTriangle;': '\u22b2', + 'LeftTriangleBar;': '\u29cf', + 'LeftTriangleEqual;': '\u22b4', + 'LeftUpDownVector;': '\u2951', + 'LeftUpTeeVector;': '\u2960', + 'LeftUpVector;': '\u21bf', + 'LeftUpVectorBar;': '\u2958', + 'LeftVector;': '\u21bc', + 'LeftVectorBar;': '\u2952', + 'lEg;': '\u2a8b', + 'leg;': '\u22da', + 'leq;': '\u2264', + 'leqq;': '\u2266', + 'leqslant;': '\u2a7d', + 'les;': '\u2a7d', + 'lescc;': '\u2aa8', + 'lesdot;': '\u2a7f', + 'lesdoto;': '\u2a81', + 'lesdotor;': '\u2a83', + 'lesg;': '\u22da\ufe00', + 'lesges;': '\u2a93', + 'lessapprox;': '\u2a85', + 'lessdot;': '\u22d6', + 'lesseqgtr;': '\u22da', + 'lesseqqgtr;': '\u2a8b', + 'LessEqualGreater;': '\u22da', + 'LessFullEqual;': '\u2266', + 'LessGreater;': '\u2276', + 'lessgtr;': '\u2276', + 'LessLess;': '\u2aa1', + 'lesssim;': '\u2272', + 'LessSlantEqual;': '\u2a7d', + 'LessTilde;': '\u2272', + 'lfisht;': '\u297c', + 'lfloor;': '\u230a', + 'Lfr;': '\U0001d50f', + 'lfr;': '\U0001d529', + 'lg;': '\u2276', + 'lgE;': '\u2a91', + 'lHar;': '\u2962', + 'lhard;': '\u21bd', + 'lharu;': '\u21bc', + 'lharul;': '\u296a', + 'lhblk;': '\u2584', + 'LJcy;': '\u0409', + 'ljcy;': '\u0459', + 'Ll;': '\u22d8', + 'll;': '\u226a', + 'llarr;': '\u21c7', + 'llcorner;': '\u231e', + 'Lleftarrow;': '\u21da', + 'llhard;': '\u296b', + 'lltri;': '\u25fa', + 'Lmidot;': '\u013f', + 'lmidot;': '\u0140', + 'lmoust;': '\u23b0', + 'lmoustache;': '\u23b0', + 'lnap;': '\u2a89', + 'lnapprox;': '\u2a89', + 'lnE;': '\u2268', + 'lne;': '\u2a87', + 'lneq;': '\u2a87', + 'lneqq;': '\u2268', + 'lnsim;': '\u22e6', + 'loang;': '\u27ec', + 'loarr;': '\u21fd', + 'lobrk;': '\u27e6', + 'LongLeftArrow;': '\u27f5', + 'Longleftarrow;': '\u27f8', + 'longleftarrow;': '\u27f5', + 'LongLeftRightArrow;': '\u27f7', + 'Longleftrightarrow;': '\u27fa', + 'longleftrightarrow;': '\u27f7', + 'longmapsto;': '\u27fc', + 'LongRightArrow;': '\u27f6', + 'Longrightarrow;': '\u27f9', + 'longrightarrow;': '\u27f6', + 'looparrowleft;': '\u21ab', + 'looparrowright;': '\u21ac', + 'lopar;': '\u2985', + 'Lopf;': '\U0001d543', + 'lopf;': '\U0001d55d', + 'loplus;': '\u2a2d', + 'lotimes;': '\u2a34', + 'lowast;': '\u2217', + 'lowbar;': '_', + 'LowerLeftArrow;': '\u2199', + 'LowerRightArrow;': '\u2198', + 'loz;': '\u25ca', + 'lozenge;': '\u25ca', + 'lozf;': '\u29eb', + 'lpar;': '(', + 'lparlt;': '\u2993', + 'lrarr;': '\u21c6', + 'lrcorner;': '\u231f', + 'lrhar;': '\u21cb', + 'lrhard;': '\u296d', + 'lrm;': '\u200e', + 'lrtri;': '\u22bf', + 'lsaquo;': '\u2039', + 'Lscr;': '\u2112', + 'lscr;': '\U0001d4c1', + 'Lsh;': '\u21b0', + 'lsh;': '\u21b0', + 'lsim;': '\u2272', + 'lsime;': '\u2a8d', + 'lsimg;': '\u2a8f', + 'lsqb;': '[', + 'lsquo;': '\u2018', + 'lsquor;': '\u201a', + 'Lstrok;': '\u0141', + 'lstrok;': '\u0142', + 'LT': '<', + 'lt': '<', + 'LT;': '<', + 'Lt;': '\u226a', + 'lt;': '<', + 'ltcc;': '\u2aa6', + 'ltcir;': '\u2a79', + 'ltdot;': '\u22d6', + 'lthree;': '\u22cb', + 'ltimes;': '\u22c9', + 'ltlarr;': '\u2976', + 'ltquest;': '\u2a7b', + 'ltri;': '\u25c3', + 'ltrie;': '\u22b4', + 'ltrif;': '\u25c2', + 'ltrPar;': '\u2996', + 'lurdshar;': '\u294a', + 'luruhar;': '\u2966', + 'lvertneqq;': '\u2268\ufe00', + 'lvnE;': '\u2268\ufe00', + 'macr': '\xaf', + 'macr;': '\xaf', + 'male;': '\u2642', + 'malt;': '\u2720', + 'maltese;': '\u2720', + 'Map;': '\u2905', + 'map;': '\u21a6', + 'mapsto;': '\u21a6', + 'mapstodown;': '\u21a7', + 'mapstoleft;': '\u21a4', + 'mapstoup;': '\u21a5', + 'marker;': '\u25ae', + 'mcomma;': '\u2a29', + 'Mcy;': '\u041c', + 'mcy;': '\u043c', + 'mdash;': '\u2014', + 'mDDot;': '\u223a', + 'measuredangle;': '\u2221', + 'MediumSpace;': '\u205f', + 'Mellintrf;': '\u2133', + 'Mfr;': '\U0001d510', + 'mfr;': '\U0001d52a', + 'mho;': '\u2127', + 'micro': '\xb5', + 'micro;': '\xb5', + 'mid;': '\u2223', + 'midast;': '*', + 'midcir;': '\u2af0', + 'middot': '\xb7', + 'middot;': '\xb7', + 'minus;': '\u2212', + 'minusb;': '\u229f', + 'minusd;': '\u2238', + 'minusdu;': '\u2a2a', + 'MinusPlus;': '\u2213', + 'mlcp;': '\u2adb', + 'mldr;': '\u2026', + 'mnplus;': '\u2213', + 'models;': '\u22a7', + 'Mopf;': '\U0001d544', + 'mopf;': '\U0001d55e', + 'mp;': '\u2213', + 'Mscr;': '\u2133', + 'mscr;': '\U0001d4c2', + 'mstpos;': '\u223e', + 'Mu;': '\u039c', + 'mu;': '\u03bc', + 'multimap;': '\u22b8', + 'mumap;': '\u22b8', + 'nabla;': '\u2207', + 'Nacute;': '\u0143', + 'nacute;': '\u0144', + 'nang;': '\u2220\u20d2', + 'nap;': '\u2249', + 'napE;': '\u2a70\u0338', + 'napid;': '\u224b\u0338', + 'napos;': '\u0149', + 'napprox;': '\u2249', + 'natur;': '\u266e', + 'natural;': '\u266e', + 'naturals;': '\u2115', + 'nbsp': '\xa0', + 'nbsp;': '\xa0', + 'nbump;': '\u224e\u0338', + 'nbumpe;': '\u224f\u0338', + 'ncap;': '\u2a43', + 'Ncaron;': '\u0147', + 'ncaron;': '\u0148', + 'Ncedil;': '\u0145', + 'ncedil;': '\u0146', + 'ncong;': '\u2247', + 'ncongdot;': '\u2a6d\u0338', + 'ncup;': '\u2a42', + 'Ncy;': '\u041d', + 'ncy;': '\u043d', + 'ndash;': '\u2013', + 'ne;': '\u2260', + 'nearhk;': '\u2924', + 'neArr;': '\u21d7', + 'nearr;': '\u2197', + 'nearrow;': '\u2197', + 'nedot;': '\u2250\u0338', + 'NegativeMediumSpace;': '\u200b', + 'NegativeThickSpace;': '\u200b', + 'NegativeThinSpace;': '\u200b', + 'NegativeVeryThinSpace;': '\u200b', + 'nequiv;': '\u2262', + 'nesear;': '\u2928', + 'nesim;': '\u2242\u0338', + 'NestedGreaterGreater;': '\u226b', + 'NestedLessLess;': '\u226a', + 'NewLine;': '\n', + 'nexist;': '\u2204', + 'nexists;': '\u2204', + 'Nfr;': '\U0001d511', + 'nfr;': '\U0001d52b', + 'ngE;': '\u2267\u0338', + 'nge;': '\u2271', + 'ngeq;': '\u2271', + 'ngeqq;': '\u2267\u0338', + 'ngeqslant;': '\u2a7e\u0338', + 'nges;': '\u2a7e\u0338', + 'nGg;': '\u22d9\u0338', + 'ngsim;': '\u2275', + 'nGt;': '\u226b\u20d2', + 'ngt;': '\u226f', + 'ngtr;': '\u226f', + 'nGtv;': '\u226b\u0338', + 'nhArr;': '\u21ce', + 'nharr;': '\u21ae', + 'nhpar;': '\u2af2', + 'ni;': '\u220b', + 'nis;': '\u22fc', + 'nisd;': '\u22fa', + 'niv;': '\u220b', + 'NJcy;': '\u040a', + 'njcy;': '\u045a', + 'nlArr;': '\u21cd', + 'nlarr;': '\u219a', + 'nldr;': '\u2025', + 'nlE;': '\u2266\u0338', + 'nle;': '\u2270', + 'nLeftarrow;': '\u21cd', + 'nleftarrow;': '\u219a', + 'nLeftrightarrow;': '\u21ce', + 'nleftrightarrow;': '\u21ae', + 'nleq;': '\u2270', + 'nleqq;': '\u2266\u0338', + 'nleqslant;': '\u2a7d\u0338', + 'nles;': '\u2a7d\u0338', + 'nless;': '\u226e', + 'nLl;': '\u22d8\u0338', + 'nlsim;': '\u2274', + 'nLt;': '\u226a\u20d2', + 'nlt;': '\u226e', + 'nltri;': '\u22ea', + 'nltrie;': '\u22ec', + 'nLtv;': '\u226a\u0338', + 'nmid;': '\u2224', + 'NoBreak;': '\u2060', + 'NonBreakingSpace;': '\xa0', + 'Nopf;': '\u2115', + 'nopf;': '\U0001d55f', + 'not': '\xac', + 'Not;': '\u2aec', + 'not;': '\xac', + 'NotCongruent;': '\u2262', + 'NotCupCap;': '\u226d', + 'NotDoubleVerticalBar;': '\u2226', + 'NotElement;': '\u2209', + 'NotEqual;': '\u2260', + 'NotEqualTilde;': '\u2242\u0338', + 'NotExists;': '\u2204', + 'NotGreater;': '\u226f', + 'NotGreaterEqual;': '\u2271', + 'NotGreaterFullEqual;': '\u2267\u0338', + 'NotGreaterGreater;': '\u226b\u0338', + 'NotGreaterLess;': '\u2279', + 'NotGreaterSlantEqual;': '\u2a7e\u0338', + 'NotGreaterTilde;': '\u2275', + 'NotHumpDownHump;': '\u224e\u0338', + 'NotHumpEqual;': '\u224f\u0338', + 'notin;': '\u2209', + 'notindot;': '\u22f5\u0338', + 'notinE;': '\u22f9\u0338', + 'notinva;': '\u2209', + 'notinvb;': '\u22f7', + 'notinvc;': '\u22f6', + 'NotLeftTriangle;': '\u22ea', + 'NotLeftTriangleBar;': '\u29cf\u0338', + 'NotLeftTriangleEqual;': '\u22ec', + 'NotLess;': '\u226e', + 'NotLessEqual;': '\u2270', + 'NotLessGreater;': '\u2278', + 'NotLessLess;': '\u226a\u0338', + 'NotLessSlantEqual;': '\u2a7d\u0338', + 'NotLessTilde;': '\u2274', + 'NotNestedGreaterGreater;': '\u2aa2\u0338', + 'NotNestedLessLess;': '\u2aa1\u0338', + 'notni;': '\u220c', + 'notniva;': '\u220c', + 'notnivb;': '\u22fe', + 'notnivc;': '\u22fd', + 'NotPrecedes;': '\u2280', + 'NotPrecedesEqual;': '\u2aaf\u0338', + 'NotPrecedesSlantEqual;': '\u22e0', + 'NotReverseElement;': '\u220c', + 'NotRightTriangle;': '\u22eb', + 'NotRightTriangleBar;': '\u29d0\u0338', + 'NotRightTriangleEqual;': '\u22ed', + 'NotSquareSubset;': '\u228f\u0338', + 'NotSquareSubsetEqual;': '\u22e2', + 'NotSquareSuperset;': '\u2290\u0338', + 'NotSquareSupersetEqual;': '\u22e3', + 'NotSubset;': '\u2282\u20d2', + 'NotSubsetEqual;': '\u2288', + 'NotSucceeds;': '\u2281', + 'NotSucceedsEqual;': '\u2ab0\u0338', + 'NotSucceedsSlantEqual;': '\u22e1', + 'NotSucceedsTilde;': '\u227f\u0338', + 'NotSuperset;': '\u2283\u20d2', + 'NotSupersetEqual;': '\u2289', + 'NotTilde;': '\u2241', + 'NotTildeEqual;': '\u2244', + 'NotTildeFullEqual;': '\u2247', + 'NotTildeTilde;': '\u2249', + 'NotVerticalBar;': '\u2224', + 'npar;': '\u2226', + 'nparallel;': '\u2226', + 'nparsl;': '\u2afd\u20e5', + 'npart;': '\u2202\u0338', + 'npolint;': '\u2a14', + 'npr;': '\u2280', + 'nprcue;': '\u22e0', + 'npre;': '\u2aaf\u0338', + 'nprec;': '\u2280', + 'npreceq;': '\u2aaf\u0338', + 'nrArr;': '\u21cf', + 'nrarr;': '\u219b', + 'nrarrc;': '\u2933\u0338', + 'nrarrw;': '\u219d\u0338', + 'nRightarrow;': '\u21cf', + 'nrightarrow;': '\u219b', + 'nrtri;': '\u22eb', + 'nrtrie;': '\u22ed', + 'nsc;': '\u2281', + 'nsccue;': '\u22e1', + 'nsce;': '\u2ab0\u0338', + 'Nscr;': '\U0001d4a9', + 'nscr;': '\U0001d4c3', + 'nshortmid;': '\u2224', + 'nshortparallel;': '\u2226', + 'nsim;': '\u2241', + 'nsime;': '\u2244', + 'nsimeq;': '\u2244', + 'nsmid;': '\u2224', + 'nspar;': '\u2226', + 'nsqsube;': '\u22e2', + 'nsqsupe;': '\u22e3', + 'nsub;': '\u2284', + 'nsubE;': '\u2ac5\u0338', + 'nsube;': '\u2288', + 'nsubset;': '\u2282\u20d2', + 'nsubseteq;': '\u2288', + 'nsubseteqq;': '\u2ac5\u0338', + 'nsucc;': '\u2281', + 'nsucceq;': '\u2ab0\u0338', + 'nsup;': '\u2285', + 'nsupE;': '\u2ac6\u0338', + 'nsupe;': '\u2289', + 'nsupset;': '\u2283\u20d2', + 'nsupseteq;': '\u2289', + 'nsupseteqq;': '\u2ac6\u0338', + 'ntgl;': '\u2279', + 'Ntilde': '\xd1', + 'ntilde': '\xf1', + 'Ntilde;': '\xd1', + 'ntilde;': '\xf1', + 'ntlg;': '\u2278', + 'ntriangleleft;': '\u22ea', + 'ntrianglelefteq;': '\u22ec', + 'ntriangleright;': '\u22eb', + 'ntrianglerighteq;': '\u22ed', + 'Nu;': '\u039d', + 'nu;': '\u03bd', + 'num;': '#', + 'numero;': '\u2116', + 'numsp;': '\u2007', + 'nvap;': '\u224d\u20d2', + 'nVDash;': '\u22af', + 'nVdash;': '\u22ae', + 'nvDash;': '\u22ad', + 'nvdash;': '\u22ac', + 'nvge;': '\u2265\u20d2', + 'nvgt;': '>\u20d2', + 'nvHarr;': '\u2904', + 'nvinfin;': '\u29de', + 'nvlArr;': '\u2902', + 'nvle;': '\u2264\u20d2', + 'nvlt;': '<\u20d2', + 'nvltrie;': '\u22b4\u20d2', + 'nvrArr;': '\u2903', + 'nvrtrie;': '\u22b5\u20d2', + 'nvsim;': '\u223c\u20d2', + 'nwarhk;': '\u2923', + 'nwArr;': '\u21d6', + 'nwarr;': '\u2196', + 'nwarrow;': '\u2196', + 'nwnear;': '\u2927', + 'Oacute': '\xd3', + 'oacute': '\xf3', + 'Oacute;': '\xd3', + 'oacute;': '\xf3', + 'oast;': '\u229b', + 'ocir;': '\u229a', + 'Ocirc': '\xd4', + 'ocirc': '\xf4', + 'Ocirc;': '\xd4', + 'ocirc;': '\xf4', + 'Ocy;': '\u041e', + 'ocy;': '\u043e', + 'odash;': '\u229d', + 'Odblac;': '\u0150', + 'odblac;': '\u0151', + 'odiv;': '\u2a38', + 'odot;': '\u2299', + 'odsold;': '\u29bc', + 'OElig;': '\u0152', + 'oelig;': '\u0153', + 'ofcir;': '\u29bf', + 'Ofr;': '\U0001d512', + 'ofr;': '\U0001d52c', + 'ogon;': '\u02db', + 'Ograve': '\xd2', + 'ograve': '\xf2', + 'Ograve;': '\xd2', + 'ograve;': '\xf2', + 'ogt;': '\u29c1', + 'ohbar;': '\u29b5', + 'ohm;': '\u03a9', + 'oint;': '\u222e', + 'olarr;': '\u21ba', + 'olcir;': '\u29be', + 'olcross;': '\u29bb', + 'oline;': '\u203e', + 'olt;': '\u29c0', + 'Omacr;': '\u014c', + 'omacr;': '\u014d', + 'Omega;': '\u03a9', + 'omega;': '\u03c9', + 'Omicron;': '\u039f', + 'omicron;': '\u03bf', + 'omid;': '\u29b6', + 'ominus;': '\u2296', + 'Oopf;': '\U0001d546', + 'oopf;': '\U0001d560', + 'opar;': '\u29b7', + 'OpenCurlyDoubleQuote;': '\u201c', + 'OpenCurlyQuote;': '\u2018', + 'operp;': '\u29b9', + 'oplus;': '\u2295', + 'Or;': '\u2a54', + 'or;': '\u2228', + 'orarr;': '\u21bb', + 'ord;': '\u2a5d', + 'order;': '\u2134', + 'orderof;': '\u2134', + 'ordf': '\xaa', + 'ordf;': '\xaa', + 'ordm': '\xba', + 'ordm;': '\xba', + 'origof;': '\u22b6', + 'oror;': '\u2a56', + 'orslope;': '\u2a57', + 'orv;': '\u2a5b', + 'oS;': '\u24c8', + 'Oscr;': '\U0001d4aa', + 'oscr;': '\u2134', + 'Oslash': '\xd8', + 'oslash': '\xf8', + 'Oslash;': '\xd8', + 'oslash;': '\xf8', + 'osol;': '\u2298', + 'Otilde': '\xd5', + 'otilde': '\xf5', + 'Otilde;': '\xd5', + 'otilde;': '\xf5', + 'Otimes;': '\u2a37', + 'otimes;': '\u2297', + 'otimesas;': '\u2a36', + 'Ouml': '\xd6', + 'ouml': '\xf6', + 'Ouml;': '\xd6', + 'ouml;': '\xf6', + 'ovbar;': '\u233d', + 'OverBar;': '\u203e', + 'OverBrace;': '\u23de', + 'OverBracket;': '\u23b4', + 'OverParenthesis;': '\u23dc', + 'par;': '\u2225', + 'para': '\xb6', + 'para;': '\xb6', + 'parallel;': '\u2225', + 'parsim;': '\u2af3', + 'parsl;': '\u2afd', + 'part;': '\u2202', + 'PartialD;': '\u2202', + 'Pcy;': '\u041f', + 'pcy;': '\u043f', + 'percnt;': '%', + 'period;': '.', + 'permil;': '\u2030', + 'perp;': '\u22a5', + 'pertenk;': '\u2031', + 'Pfr;': '\U0001d513', + 'pfr;': '\U0001d52d', + 'Phi;': '\u03a6', + 'phi;': '\u03c6', + 'phiv;': '\u03d5', + 'phmmat;': '\u2133', + 'phone;': '\u260e', + 'Pi;': '\u03a0', + 'pi;': '\u03c0', + 'pitchfork;': '\u22d4', + 'piv;': '\u03d6', + 'planck;': '\u210f', + 'planckh;': '\u210e', + 'plankv;': '\u210f', + 'plus;': '+', + 'plusacir;': '\u2a23', + 'plusb;': '\u229e', + 'pluscir;': '\u2a22', + 'plusdo;': '\u2214', + 'plusdu;': '\u2a25', + 'pluse;': '\u2a72', + 'PlusMinus;': '\xb1', + 'plusmn': '\xb1', + 'plusmn;': '\xb1', + 'plussim;': '\u2a26', + 'plustwo;': '\u2a27', + 'pm;': '\xb1', + 'Poincareplane;': '\u210c', + 'pointint;': '\u2a15', + 'Popf;': '\u2119', + 'popf;': '\U0001d561', + 'pound': '\xa3', + 'pound;': '\xa3', + 'Pr;': '\u2abb', + 'pr;': '\u227a', + 'prap;': '\u2ab7', + 'prcue;': '\u227c', + 'prE;': '\u2ab3', + 'pre;': '\u2aaf', + 'prec;': '\u227a', + 'precapprox;': '\u2ab7', + 'preccurlyeq;': '\u227c', + 'Precedes;': '\u227a', + 'PrecedesEqual;': '\u2aaf', + 'PrecedesSlantEqual;': '\u227c', + 'PrecedesTilde;': '\u227e', + 'preceq;': '\u2aaf', + 'precnapprox;': '\u2ab9', + 'precneqq;': '\u2ab5', + 'precnsim;': '\u22e8', + 'precsim;': '\u227e', + 'Prime;': '\u2033', + 'prime;': '\u2032', + 'primes;': '\u2119', + 'prnap;': '\u2ab9', + 'prnE;': '\u2ab5', + 'prnsim;': '\u22e8', + 'prod;': '\u220f', + 'Product;': '\u220f', + 'profalar;': '\u232e', + 'profline;': '\u2312', + 'profsurf;': '\u2313', + 'prop;': '\u221d', + 'Proportion;': '\u2237', + 'Proportional;': '\u221d', + 'propto;': '\u221d', + 'prsim;': '\u227e', + 'prurel;': '\u22b0', + 'Pscr;': '\U0001d4ab', + 'pscr;': '\U0001d4c5', + 'Psi;': '\u03a8', + 'psi;': '\u03c8', + 'puncsp;': '\u2008', + 'Qfr;': '\U0001d514', + 'qfr;': '\U0001d52e', + 'qint;': '\u2a0c', + 'Qopf;': '\u211a', + 'qopf;': '\U0001d562', + 'qprime;': '\u2057', + 'Qscr;': '\U0001d4ac', + 'qscr;': '\U0001d4c6', + 'quaternions;': '\u210d', + 'quatint;': '\u2a16', + 'quest;': '?', + 'questeq;': '\u225f', + 'QUOT': '"', + 'quot': '"', + 'QUOT;': '"', + 'quot;': '"', + 'rAarr;': '\u21db', + 'race;': '\u223d\u0331', + 'Racute;': '\u0154', + 'racute;': '\u0155', + 'radic;': '\u221a', + 'raemptyv;': '\u29b3', + 'Rang;': '\u27eb', + 'rang;': '\u27e9', + 'rangd;': '\u2992', + 'range;': '\u29a5', + 'rangle;': '\u27e9', + 'raquo': '\xbb', + 'raquo;': '\xbb', + 'Rarr;': '\u21a0', + 'rArr;': '\u21d2', + 'rarr;': '\u2192', + 'rarrap;': '\u2975', + 'rarrb;': '\u21e5', + 'rarrbfs;': '\u2920', + 'rarrc;': '\u2933', + 'rarrfs;': '\u291e', + 'rarrhk;': '\u21aa', + 'rarrlp;': '\u21ac', + 'rarrpl;': '\u2945', + 'rarrsim;': '\u2974', + 'Rarrtl;': '\u2916', + 'rarrtl;': '\u21a3', + 'rarrw;': '\u219d', + 'rAtail;': '\u291c', + 'ratail;': '\u291a', + 'ratio;': '\u2236', + 'rationals;': '\u211a', + 'RBarr;': '\u2910', + 'rBarr;': '\u290f', + 'rbarr;': '\u290d', + 'rbbrk;': '\u2773', + 'rbrace;': '}', + 'rbrack;': ']', + 'rbrke;': '\u298c', + 'rbrksld;': '\u298e', + 'rbrkslu;': '\u2990', + 'Rcaron;': '\u0158', + 'rcaron;': '\u0159', + 'Rcedil;': '\u0156', + 'rcedil;': '\u0157', + 'rceil;': '\u2309', + 'rcub;': '}', + 'Rcy;': '\u0420', + 'rcy;': '\u0440', + 'rdca;': '\u2937', + 'rdldhar;': '\u2969', + 'rdquo;': '\u201d', + 'rdquor;': '\u201d', + 'rdsh;': '\u21b3', + 'Re;': '\u211c', + 'real;': '\u211c', + 'realine;': '\u211b', + 'realpart;': '\u211c', + 'reals;': '\u211d', + 'rect;': '\u25ad', + 'REG': '\xae', + 'reg': '\xae', + 'REG;': '\xae', + 'reg;': '\xae', + 'ReverseElement;': '\u220b', + 'ReverseEquilibrium;': '\u21cb', + 'ReverseUpEquilibrium;': '\u296f', + 'rfisht;': '\u297d', + 'rfloor;': '\u230b', + 'Rfr;': '\u211c', + 'rfr;': '\U0001d52f', + 'rHar;': '\u2964', + 'rhard;': '\u21c1', + 'rharu;': '\u21c0', + 'rharul;': '\u296c', + 'Rho;': '\u03a1', + 'rho;': '\u03c1', + 'rhov;': '\u03f1', + 'RightAngleBracket;': '\u27e9', + 'RightArrow;': '\u2192', + 'Rightarrow;': '\u21d2', + 'rightarrow;': '\u2192', + 'RightArrowBar;': '\u21e5', + 'RightArrowLeftArrow;': '\u21c4', + 'rightarrowtail;': '\u21a3', + 'RightCeiling;': '\u2309', + 'RightDoubleBracket;': '\u27e7', + 'RightDownTeeVector;': '\u295d', + 'RightDownVector;': '\u21c2', + 'RightDownVectorBar;': '\u2955', + 'RightFloor;': '\u230b', + 'rightharpoondown;': '\u21c1', + 'rightharpoonup;': '\u21c0', + 'rightleftarrows;': '\u21c4', + 'rightleftharpoons;': '\u21cc', + 'rightrightarrows;': '\u21c9', + 'rightsquigarrow;': '\u219d', + 'RightTee;': '\u22a2', + 'RightTeeArrow;': '\u21a6', + 'RightTeeVector;': '\u295b', + 'rightthreetimes;': '\u22cc', + 'RightTriangle;': '\u22b3', + 'RightTriangleBar;': '\u29d0', + 'RightTriangleEqual;': '\u22b5', + 'RightUpDownVector;': '\u294f', + 'RightUpTeeVector;': '\u295c', + 'RightUpVector;': '\u21be', + 'RightUpVectorBar;': '\u2954', + 'RightVector;': '\u21c0', + 'RightVectorBar;': '\u2953', + 'ring;': '\u02da', + 'risingdotseq;': '\u2253', + 'rlarr;': '\u21c4', + 'rlhar;': '\u21cc', + 'rlm;': '\u200f', + 'rmoust;': '\u23b1', + 'rmoustache;': '\u23b1', + 'rnmid;': '\u2aee', + 'roang;': '\u27ed', + 'roarr;': '\u21fe', + 'robrk;': '\u27e7', + 'ropar;': '\u2986', + 'Ropf;': '\u211d', + 'ropf;': '\U0001d563', + 'roplus;': '\u2a2e', + 'rotimes;': '\u2a35', + 'RoundImplies;': '\u2970', + 'rpar;': ')', + 'rpargt;': '\u2994', + 'rppolint;': '\u2a12', + 'rrarr;': '\u21c9', + 'Rrightarrow;': '\u21db', + 'rsaquo;': '\u203a', + 'Rscr;': '\u211b', + 'rscr;': '\U0001d4c7', + 'Rsh;': '\u21b1', + 'rsh;': '\u21b1', + 'rsqb;': ']', + 'rsquo;': '\u2019', + 'rsquor;': '\u2019', + 'rthree;': '\u22cc', + 'rtimes;': '\u22ca', + 'rtri;': '\u25b9', + 'rtrie;': '\u22b5', + 'rtrif;': '\u25b8', + 'rtriltri;': '\u29ce', + 'RuleDelayed;': '\u29f4', + 'ruluhar;': '\u2968', + 'rx;': '\u211e', + 'Sacute;': '\u015a', + 'sacute;': '\u015b', + 'sbquo;': '\u201a', + 'Sc;': '\u2abc', + 'sc;': '\u227b', + 'scap;': '\u2ab8', + 'Scaron;': '\u0160', + 'scaron;': '\u0161', + 'sccue;': '\u227d', + 'scE;': '\u2ab4', + 'sce;': '\u2ab0', + 'Scedil;': '\u015e', + 'scedil;': '\u015f', + 'Scirc;': '\u015c', + 'scirc;': '\u015d', + 'scnap;': '\u2aba', + 'scnE;': '\u2ab6', + 'scnsim;': '\u22e9', + 'scpolint;': '\u2a13', + 'scsim;': '\u227f', + 'Scy;': '\u0421', + 'scy;': '\u0441', + 'sdot;': '\u22c5', + 'sdotb;': '\u22a1', + 'sdote;': '\u2a66', + 'searhk;': '\u2925', + 'seArr;': '\u21d8', + 'searr;': '\u2198', + 'searrow;': '\u2198', + 'sect': '\xa7', + 'sect;': '\xa7', + 'semi;': ';', + 'seswar;': '\u2929', + 'setminus;': '\u2216', + 'setmn;': '\u2216', + 'sext;': '\u2736', + 'Sfr;': '\U0001d516', + 'sfr;': '\U0001d530', + 'sfrown;': '\u2322', + 'sharp;': '\u266f', + 'SHCHcy;': '\u0429', + 'shchcy;': '\u0449', + 'SHcy;': '\u0428', + 'shcy;': '\u0448', + 'ShortDownArrow;': '\u2193', + 'ShortLeftArrow;': '\u2190', + 'shortmid;': '\u2223', + 'shortparallel;': '\u2225', + 'ShortRightArrow;': '\u2192', + 'ShortUpArrow;': '\u2191', + 'shy': '\xad', + 'shy;': '\xad', + 'Sigma;': '\u03a3', + 'sigma;': '\u03c3', + 'sigmaf;': '\u03c2', + 'sigmav;': '\u03c2', + 'sim;': '\u223c', + 'simdot;': '\u2a6a', + 'sime;': '\u2243', + 'simeq;': '\u2243', + 'simg;': '\u2a9e', + 'simgE;': '\u2aa0', + 'siml;': '\u2a9d', + 'simlE;': '\u2a9f', + 'simne;': '\u2246', + 'simplus;': '\u2a24', + 'simrarr;': '\u2972', + 'slarr;': '\u2190', + 'SmallCircle;': '\u2218', + 'smallsetminus;': '\u2216', + 'smashp;': '\u2a33', + 'smeparsl;': '\u29e4', + 'smid;': '\u2223', + 'smile;': '\u2323', + 'smt;': '\u2aaa', + 'smte;': '\u2aac', + 'smtes;': '\u2aac\ufe00', + 'SOFTcy;': '\u042c', + 'softcy;': '\u044c', + 'sol;': '/', + 'solb;': '\u29c4', + 'solbar;': '\u233f', + 'Sopf;': '\U0001d54a', + 'sopf;': '\U0001d564', + 'spades;': '\u2660', + 'spadesuit;': '\u2660', + 'spar;': '\u2225', + 'sqcap;': '\u2293', + 'sqcaps;': '\u2293\ufe00', + 'sqcup;': '\u2294', + 'sqcups;': '\u2294\ufe00', + 'Sqrt;': '\u221a', + 'sqsub;': '\u228f', + 'sqsube;': '\u2291', + 'sqsubset;': '\u228f', + 'sqsubseteq;': '\u2291', + 'sqsup;': '\u2290', + 'sqsupe;': '\u2292', + 'sqsupset;': '\u2290', + 'sqsupseteq;': '\u2292', + 'squ;': '\u25a1', + 'Square;': '\u25a1', + 'square;': '\u25a1', + 'SquareIntersection;': '\u2293', + 'SquareSubset;': '\u228f', + 'SquareSubsetEqual;': '\u2291', + 'SquareSuperset;': '\u2290', + 'SquareSupersetEqual;': '\u2292', + 'SquareUnion;': '\u2294', + 'squarf;': '\u25aa', + 'squf;': '\u25aa', + 'srarr;': '\u2192', + 'Sscr;': '\U0001d4ae', + 'sscr;': '\U0001d4c8', + 'ssetmn;': '\u2216', + 'ssmile;': '\u2323', + 'sstarf;': '\u22c6', + 'Star;': '\u22c6', + 'star;': '\u2606', + 'starf;': '\u2605', + 'straightepsilon;': '\u03f5', + 'straightphi;': '\u03d5', + 'strns;': '\xaf', + 'Sub;': '\u22d0', + 'sub;': '\u2282', + 'subdot;': '\u2abd', + 'subE;': '\u2ac5', + 'sube;': '\u2286', + 'subedot;': '\u2ac3', + 'submult;': '\u2ac1', + 'subnE;': '\u2acb', + 'subne;': '\u228a', + 'subplus;': '\u2abf', + 'subrarr;': '\u2979', + 'Subset;': '\u22d0', + 'subset;': '\u2282', + 'subseteq;': '\u2286', + 'subseteqq;': '\u2ac5', + 'SubsetEqual;': '\u2286', + 'subsetneq;': '\u228a', + 'subsetneqq;': '\u2acb', + 'subsim;': '\u2ac7', + 'subsub;': '\u2ad5', + 'subsup;': '\u2ad3', + 'succ;': '\u227b', + 'succapprox;': '\u2ab8', + 'succcurlyeq;': '\u227d', + 'Succeeds;': '\u227b', + 'SucceedsEqual;': '\u2ab0', + 'SucceedsSlantEqual;': '\u227d', + 'SucceedsTilde;': '\u227f', + 'succeq;': '\u2ab0', + 'succnapprox;': '\u2aba', + 'succneqq;': '\u2ab6', + 'succnsim;': '\u22e9', + 'succsim;': '\u227f', + 'SuchThat;': '\u220b', + 'Sum;': '\u2211', + 'sum;': '\u2211', + 'sung;': '\u266a', + 'sup1': '\xb9', + 'sup1;': '\xb9', + 'sup2': '\xb2', + 'sup2;': '\xb2', + 'sup3': '\xb3', + 'sup3;': '\xb3', + 'Sup;': '\u22d1', + 'sup;': '\u2283', + 'supdot;': '\u2abe', + 'supdsub;': '\u2ad8', + 'supE;': '\u2ac6', + 'supe;': '\u2287', + 'supedot;': '\u2ac4', + 'Superset;': '\u2283', + 'SupersetEqual;': '\u2287', + 'suphsol;': '\u27c9', + 'suphsub;': '\u2ad7', + 'suplarr;': '\u297b', + 'supmult;': '\u2ac2', + 'supnE;': '\u2acc', + 'supne;': '\u228b', + 'supplus;': '\u2ac0', + 'Supset;': '\u22d1', + 'supset;': '\u2283', + 'supseteq;': '\u2287', + 'supseteqq;': '\u2ac6', + 'supsetneq;': '\u228b', + 'supsetneqq;': '\u2acc', + 'supsim;': '\u2ac8', + 'supsub;': '\u2ad4', + 'supsup;': '\u2ad6', + 'swarhk;': '\u2926', + 'swArr;': '\u21d9', + 'swarr;': '\u2199', + 'swarrow;': '\u2199', + 'swnwar;': '\u292a', + 'szlig': '\xdf', + 'szlig;': '\xdf', + 'Tab;': '\t', + 'target;': '\u2316', + 'Tau;': '\u03a4', + 'tau;': '\u03c4', + 'tbrk;': '\u23b4', + 'Tcaron;': '\u0164', + 'tcaron;': '\u0165', + 'Tcedil;': '\u0162', + 'tcedil;': '\u0163', + 'Tcy;': '\u0422', + 'tcy;': '\u0442', + 'tdot;': '\u20db', + 'telrec;': '\u2315', + 'Tfr;': '\U0001d517', + 'tfr;': '\U0001d531', + 'there4;': '\u2234', + 'Therefore;': '\u2234', + 'therefore;': '\u2234', + 'Theta;': '\u0398', + 'theta;': '\u03b8', + 'thetasym;': '\u03d1', + 'thetav;': '\u03d1', + 'thickapprox;': '\u2248', + 'thicksim;': '\u223c', + 'ThickSpace;': '\u205f\u200a', + 'thinsp;': '\u2009', + 'ThinSpace;': '\u2009', + 'thkap;': '\u2248', + 'thksim;': '\u223c', + 'THORN': '\xde', + 'thorn': '\xfe', + 'THORN;': '\xde', + 'thorn;': '\xfe', + 'Tilde;': '\u223c', + 'tilde;': '\u02dc', + 'TildeEqual;': '\u2243', + 'TildeFullEqual;': '\u2245', + 'TildeTilde;': '\u2248', + 'times': '\xd7', + 'times;': '\xd7', + 'timesb;': '\u22a0', + 'timesbar;': '\u2a31', + 'timesd;': '\u2a30', + 'tint;': '\u222d', + 'toea;': '\u2928', + 'top;': '\u22a4', + 'topbot;': '\u2336', + 'topcir;': '\u2af1', + 'Topf;': '\U0001d54b', + 'topf;': '\U0001d565', + 'topfork;': '\u2ada', + 'tosa;': '\u2929', + 'tprime;': '\u2034', + 'TRADE;': '\u2122', + 'trade;': '\u2122', + 'triangle;': '\u25b5', + 'triangledown;': '\u25bf', + 'triangleleft;': '\u25c3', + 'trianglelefteq;': '\u22b4', + 'triangleq;': '\u225c', + 'triangleright;': '\u25b9', + 'trianglerighteq;': '\u22b5', + 'tridot;': '\u25ec', + 'trie;': '\u225c', + 'triminus;': '\u2a3a', + 'TripleDot;': '\u20db', + 'triplus;': '\u2a39', + 'trisb;': '\u29cd', + 'tritime;': '\u2a3b', + 'trpezium;': '\u23e2', + 'Tscr;': '\U0001d4af', + 'tscr;': '\U0001d4c9', + 'TScy;': '\u0426', + 'tscy;': '\u0446', + 'TSHcy;': '\u040b', + 'tshcy;': '\u045b', + 'Tstrok;': '\u0166', + 'tstrok;': '\u0167', + 'twixt;': '\u226c', + 'twoheadleftarrow;': '\u219e', + 'twoheadrightarrow;': '\u21a0', + 'Uacute': '\xda', + 'uacute': '\xfa', + 'Uacute;': '\xda', + 'uacute;': '\xfa', + 'Uarr;': '\u219f', + 'uArr;': '\u21d1', + 'uarr;': '\u2191', + 'Uarrocir;': '\u2949', + 'Ubrcy;': '\u040e', + 'ubrcy;': '\u045e', + 'Ubreve;': '\u016c', + 'ubreve;': '\u016d', + 'Ucirc': '\xdb', + 'ucirc': '\xfb', + 'Ucirc;': '\xdb', + 'ucirc;': '\xfb', + 'Ucy;': '\u0423', + 'ucy;': '\u0443', + 'udarr;': '\u21c5', + 'Udblac;': '\u0170', + 'udblac;': '\u0171', + 'udhar;': '\u296e', + 'ufisht;': '\u297e', + 'Ufr;': '\U0001d518', + 'ufr;': '\U0001d532', + 'Ugrave': '\xd9', + 'ugrave': '\xf9', + 'Ugrave;': '\xd9', + 'ugrave;': '\xf9', + 'uHar;': '\u2963', + 'uharl;': '\u21bf', + 'uharr;': '\u21be', + 'uhblk;': '\u2580', + 'ulcorn;': '\u231c', + 'ulcorner;': '\u231c', + 'ulcrop;': '\u230f', + 'ultri;': '\u25f8', + 'Umacr;': '\u016a', + 'umacr;': '\u016b', + 'uml': '\xa8', + 'uml;': '\xa8', + 'UnderBar;': '_', + 'UnderBrace;': '\u23df', + 'UnderBracket;': '\u23b5', + 'UnderParenthesis;': '\u23dd', + 'Union;': '\u22c3', + 'UnionPlus;': '\u228e', + 'Uogon;': '\u0172', + 'uogon;': '\u0173', + 'Uopf;': '\U0001d54c', + 'uopf;': '\U0001d566', + 'UpArrow;': '\u2191', + 'Uparrow;': '\u21d1', + 'uparrow;': '\u2191', + 'UpArrowBar;': '\u2912', + 'UpArrowDownArrow;': '\u21c5', + 'UpDownArrow;': '\u2195', + 'Updownarrow;': '\u21d5', + 'updownarrow;': '\u2195', + 'UpEquilibrium;': '\u296e', + 'upharpoonleft;': '\u21bf', + 'upharpoonright;': '\u21be', + 'uplus;': '\u228e', + 'UpperLeftArrow;': '\u2196', + 'UpperRightArrow;': '\u2197', + 'Upsi;': '\u03d2', + 'upsi;': '\u03c5', + 'upsih;': '\u03d2', + 'Upsilon;': '\u03a5', + 'upsilon;': '\u03c5', + 'UpTee;': '\u22a5', + 'UpTeeArrow;': '\u21a5', + 'upuparrows;': '\u21c8', + 'urcorn;': '\u231d', + 'urcorner;': '\u231d', + 'urcrop;': '\u230e', + 'Uring;': '\u016e', + 'uring;': '\u016f', + 'urtri;': '\u25f9', + 'Uscr;': '\U0001d4b0', + 'uscr;': '\U0001d4ca', + 'utdot;': '\u22f0', + 'Utilde;': '\u0168', + 'utilde;': '\u0169', + 'utri;': '\u25b5', + 'utrif;': '\u25b4', + 'uuarr;': '\u21c8', + 'Uuml': '\xdc', + 'uuml': '\xfc', + 'Uuml;': '\xdc', + 'uuml;': '\xfc', + 'uwangle;': '\u29a7', + 'vangrt;': '\u299c', + 'varepsilon;': '\u03f5', + 'varkappa;': '\u03f0', + 'varnothing;': '\u2205', + 'varphi;': '\u03d5', + 'varpi;': '\u03d6', + 'varpropto;': '\u221d', + 'vArr;': '\u21d5', + 'varr;': '\u2195', + 'varrho;': '\u03f1', + 'varsigma;': '\u03c2', + 'varsubsetneq;': '\u228a\ufe00', + 'varsubsetneqq;': '\u2acb\ufe00', + 'varsupsetneq;': '\u228b\ufe00', + 'varsupsetneqq;': '\u2acc\ufe00', + 'vartheta;': '\u03d1', + 'vartriangleleft;': '\u22b2', + 'vartriangleright;': '\u22b3', + 'Vbar;': '\u2aeb', + 'vBar;': '\u2ae8', + 'vBarv;': '\u2ae9', + 'Vcy;': '\u0412', + 'vcy;': '\u0432', + 'VDash;': '\u22ab', + 'Vdash;': '\u22a9', + 'vDash;': '\u22a8', + 'vdash;': '\u22a2', + 'Vdashl;': '\u2ae6', + 'Vee;': '\u22c1', + 'vee;': '\u2228', + 'veebar;': '\u22bb', + 'veeeq;': '\u225a', + 'vellip;': '\u22ee', + 'Verbar;': '\u2016', + 'verbar;': '|', + 'Vert;': '\u2016', + 'vert;': '|', + 'VerticalBar;': '\u2223', + 'VerticalLine;': '|', + 'VerticalSeparator;': '\u2758', + 'VerticalTilde;': '\u2240', + 'VeryThinSpace;': '\u200a', + 'Vfr;': '\U0001d519', + 'vfr;': '\U0001d533', + 'vltri;': '\u22b2', + 'vnsub;': '\u2282\u20d2', + 'vnsup;': '\u2283\u20d2', + 'Vopf;': '\U0001d54d', + 'vopf;': '\U0001d567', + 'vprop;': '\u221d', + 'vrtri;': '\u22b3', + 'Vscr;': '\U0001d4b1', + 'vscr;': '\U0001d4cb', + 'vsubnE;': '\u2acb\ufe00', + 'vsubne;': '\u228a\ufe00', + 'vsupnE;': '\u2acc\ufe00', + 'vsupne;': '\u228b\ufe00', + 'Vvdash;': '\u22aa', + 'vzigzag;': '\u299a', + 'Wcirc;': '\u0174', + 'wcirc;': '\u0175', + 'wedbar;': '\u2a5f', + 'Wedge;': '\u22c0', + 'wedge;': '\u2227', + 'wedgeq;': '\u2259', + 'weierp;': '\u2118', + 'Wfr;': '\U0001d51a', + 'wfr;': '\U0001d534', + 'Wopf;': '\U0001d54e', + 'wopf;': '\U0001d568', + 'wp;': '\u2118', + 'wr;': '\u2240', + 'wreath;': '\u2240', + 'Wscr;': '\U0001d4b2', + 'wscr;': '\U0001d4cc', + 'xcap;': '\u22c2', + 'xcirc;': '\u25ef', + 'xcup;': '\u22c3', + 'xdtri;': '\u25bd', + 'Xfr;': '\U0001d51b', + 'xfr;': '\U0001d535', + 'xhArr;': '\u27fa', + 'xharr;': '\u27f7', + 'Xi;': '\u039e', + 'xi;': '\u03be', + 'xlArr;': '\u27f8', + 'xlarr;': '\u27f5', + 'xmap;': '\u27fc', + 'xnis;': '\u22fb', + 'xodot;': '\u2a00', + 'Xopf;': '\U0001d54f', + 'xopf;': '\U0001d569', + 'xoplus;': '\u2a01', + 'xotime;': '\u2a02', + 'xrArr;': '\u27f9', + 'xrarr;': '\u27f6', + 'Xscr;': '\U0001d4b3', + 'xscr;': '\U0001d4cd', + 'xsqcup;': '\u2a06', + 'xuplus;': '\u2a04', + 'xutri;': '\u25b3', + 'xvee;': '\u22c1', + 'xwedge;': '\u22c0', + 'Yacute': '\xdd', + 'yacute': '\xfd', + 'Yacute;': '\xdd', + 'yacute;': '\xfd', + 'YAcy;': '\u042f', + 'yacy;': '\u044f', + 'Ycirc;': '\u0176', + 'ycirc;': '\u0177', + 'Ycy;': '\u042b', + 'ycy;': '\u044b', + 'yen': '\xa5', + 'yen;': '\xa5', + 'Yfr;': '\U0001d51c', + 'yfr;': '\U0001d536', + 'YIcy;': '\u0407', + 'yicy;': '\u0457', + 'Yopf;': '\U0001d550', + 'yopf;': '\U0001d56a', + 'Yscr;': '\U0001d4b4', + 'yscr;': '\U0001d4ce', + 'YUcy;': '\u042e', + 'yucy;': '\u044e', + 'yuml': '\xff', + 'Yuml;': '\u0178', + 'yuml;': '\xff', + 'Zacute;': '\u0179', + 'zacute;': '\u017a', + 'Zcaron;': '\u017d', + 'zcaron;': '\u017e', + 'Zcy;': '\u0417', + 'zcy;': '\u0437', + 'Zdot;': '\u017b', + 'zdot;': '\u017c', + 'zeetrf;': '\u2128', + 'ZeroWidthSpace;': '\u200b', + 'Zeta;': '\u0396', + 'zeta;': '\u03b6', + 'Zfr;': '\u2128', + 'zfr;': '\U0001d537', + 'ZHcy;': '\u0416', + 'zhcy;': '\u0436', + 'zigrarr;': '\u21dd', + 'Zopf;': '\u2124', + 'zopf;': '\U0001d56b', + 'Zscr;': '\U0001d4b5', + 'zscr;': '\U0001d4cf', + 'zwj;': '\u200d', + 'zwnj;': '\u200c', +} + +# maps the Unicode code point to the HTML entity name +codepoint2name = {} + +# maps the HTML entity name to the character +# (or a character reference if the character is outside the Latin-1 range) +entitydefs = {} + +for (name, codepoint) in name2codepoint.items(): + codepoint2name[codepoint] = name + entitydefs[name] = chr(codepoint) + +del name, codepoint diff --git a/Lib/html/parser.py b/Lib/html/parser.py new file mode 100644 index 0000000000..ef869bc72d --- /dev/null +++ b/Lib/html/parser.py @@ -0,0 +1,470 @@ +"""A parser for HTML and XHTML.""" + +# This file is based on sgmllib.py, but the API is slightly different. + +# XXX There should be a way to distinguish between PCDATA (parsed +# character data -- the normal case), RCDATA (replaceable character +# data -- only char and entity references and end tags are special) +# and CDATA (character data -- only end tags are special). + + +import re +import warnings +import _markupbase + +from html import unescape + + +__all__ = ['HTMLParser'] + +# Regular expressions used for parsing + +interesting_normal = re.compile('[&<]') +incomplete = re.compile('&[a-zA-Z#]') + +entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') +charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') + +starttagopen = re.compile('<[a-zA-Z]') +piclose = re.compile('>') +commentclose = re.compile(r'--\s*>') +# Note: +# 1) if you change tagfind/attrfind remember to update locatestarttagend too; +# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will +# explode, so don't do it. +# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state +# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state +tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*') +attrfind_tolerant = re.compile( + r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*' + r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*') +locatestarttagend_tolerant = re.compile(r""" + <[a-zA-Z][^\t\n\r\f />\x00]* # tag name + (?:[\s/]* # optional whitespace before attribute name + (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name + (?:\s*=+\s* # value indicator + (?:'[^']*' # LITA-enclosed value + |"[^"]*" # LIT-enclosed value + |(?!['"])[^>\s]* # bare value + ) + (?:\s*,)* # possibly followed by a comma + )?(?:\s|/(?!>))* + )* + )? + \s* # trailing whitespace +""", re.VERBOSE) +endendtag = re.compile('>') +# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between +# ') + + + +class HTMLParser(_markupbase.ParserBase): + """Find tags and other markup and call handler functions. + + Usage: + p = HTMLParser() + p.feed(data) + ... + p.close() + + Start tags are handled by calling self.handle_starttag() or + self.handle_startendtag(); end tags by self.handle_endtag(). The + data between tags is passed from the parser to the derived class + by calling self.handle_data() with the data as argument (the data + may be split up in arbitrary chunks). If convert_charrefs is + True the character references are converted automatically to the + corresponding Unicode character (and self.handle_data() is no + longer split in chunks), otherwise they are passed by calling + self.handle_entityref() or self.handle_charref() with the string + containing respectively the named or numeric reference as the + argument. + """ + + CDATA_CONTENT_ELEMENTS = ("script", "style") + + def __init__(self, *, convert_charrefs=True): + """Initialize and reset this instance. + + If convert_charrefs is True (the default), all character references + are automatically converted to the corresponding Unicode characters. + """ + self.convert_charrefs = convert_charrefs + self.reset() + + def reset(self): + """Reset this instance. Loses all unprocessed data.""" + self.rawdata = '' + self.lasttag = '???' + self.interesting = interesting_normal + self.cdata_elem = None + _markupbase.ParserBase.reset(self) + + def feed(self, data): + r"""Feed data to the parser. + + Call this as often as you want, with as little or as much text + as you want (may include '\n'). + """ + self.rawdata = self.rawdata + data + self.goahead(0) + + def close(self): + """Handle any buffered data.""" + self.goahead(1) + + __starttag_text = None + + def get_starttag_text(self): + """Return full source of start tag: '<...>'.""" + return self.__starttag_text + + def set_cdata_mode(self, elem): + self.cdata_elem = elem.lower() + self.interesting = re.compile(r'' % self.cdata_elem, re.I) + + def clear_cdata_mode(self): + self.interesting = interesting_normal + self.cdata_elem = None + + # Internal -- handle data as far as reasonable. May leave state + # and data to be processed by a subsequent call. If 'end' is + # true, force handling all data as if followed by EOF marker. + def goahead(self, end): + rawdata = self.rawdata + i = 0 + n = len(rawdata) + while i < n: + if self.convert_charrefs and not self.cdata_elem: + j = rawdata.find('<', i) + if j < 0: + # if we can't find the next <, either we are at the end + # or there's more text incoming. If the latter is True, + # we can't pass the text to handle_data in case we have + # a charref cut in half at end. Try to determine if + # this is the case before proceeding by looking for an + # & near the end and see if it's followed by a space or ;. + amppos = rawdata.rfind('&', max(i, n-34)) + if (amppos >= 0 and + not re.compile(r'[\s;]').search(rawdata, amppos)): + break # wait till we get all the text + j = n + else: + match = self.interesting.search(rawdata, i) # < or & + if match: + j = match.start() + else: + if self.cdata_elem: + break + j = n + if i < j: + if self.convert_charrefs and not self.cdata_elem: + self.handle_data(unescape(rawdata[i:j])) + else: + self.handle_data(rawdata[i:j]) + i = self.updatepos(i, j) + if i == n: break + startswith = rawdata.startswith + if startswith('<', i): + if starttagopen.match(rawdata, i): # < + letter + k = self.parse_starttag(i) + elif startswith("', i + 1) + if k < 0: + k = rawdata.find('<', i + 1) + if k < 0: + k = i + 1 + else: + k += 1 + if self.convert_charrefs and not self.cdata_elem: + self.handle_data(unescape(rawdata[i:k])) + else: + self.handle_data(rawdata[i:k]) + i = self.updatepos(i, k) + elif startswith("&#", i): + match = charref.match(rawdata, i) + if match: + name = match.group()[2:-1] + self.handle_charref(name) + k = match.end() + if not startswith(';', k-1): + k = k - 1 + i = self.updatepos(i, k) + continue + else: + if ";" in rawdata[i:]: # bail by consuming &# + self.handle_data(rawdata[i:i+2]) + i = self.updatepos(i, i+2) + break + elif startswith('&', i): + match = entityref.match(rawdata, i) + if match: + name = match.group(1) + self.handle_entityref(name) + k = match.end() + if not startswith(';', k-1): + k = k - 1 + i = self.updatepos(i, k) + continue + match = incomplete.match(rawdata, i) + if match: + # match.group() will contain at least 2 chars + if end and match.group() == rawdata[i:]: + k = match.end() + if k <= i: + k = n + i = self.updatepos(i, i + 1) + # incomplete + break + elif (i + 1) < n: + # not the end of the buffer, and can't be confused + # with some other construct + self.handle_data("&") + i = self.updatepos(i, i + 1) + else: + break + else: + assert 0, "interesting.search() lied" + # end while + if end and i < n and not self.cdata_elem: + if self.convert_charrefs and not self.cdata_elem: + self.handle_data(unescape(rawdata[i:n])) + else: + self.handle_data(rawdata[i:n]) + i = self.updatepos(i, n) + self.rawdata = rawdata[i:] + + # Internal -- parse html declarations, return length or -1 if not terminated + # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state + # See also parse_declaration in _markupbase + def parse_html_declaration(self, i): + rawdata = self.rawdata + assert rawdata[i:i+2] == ' + gtpos = rawdata.find('>', i+9) + if gtpos == -1: + return -1 + self.handle_decl(rawdata[i+2:gtpos]) + return gtpos+1 + else: + return self.parse_bogus_comment(i) + + # Internal -- parse bogus comment, return length or -1 if not terminated + # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state + def parse_bogus_comment(self, i, report=1): + rawdata = self.rawdata + assert rawdata[i:i+2] in ('', i+2) + if pos == -1: + return -1 + if report: + self.handle_comment(rawdata[i+2:pos]) + return pos + 1 + + # Internal -- parse processing instr, return end or -1 if not terminated + def parse_pi(self, i): + rawdata = self.rawdata + assert rawdata[i:i+2] == ' + if not match: + return -1 + j = match.start() + self.handle_pi(rawdata[i+2: j]) + j = match.end() + return j + + # Internal -- handle starttag, return end or -1 if not terminated + def parse_starttag(self, i): + self.__starttag_text = None + endpos = self.check_for_whole_start_tag(i) + if endpos < 0: + return endpos + rawdata = self.rawdata + self.__starttag_text = rawdata[i:endpos] + + # Now parse the data between i+1 and j into a tag and attrs + attrs = [] + match = tagfind_tolerant.match(rawdata, i+1) + assert match, 'unexpected call to parse_starttag()' + k = match.end() + self.lasttag = tag = match.group(1).lower() + while k < endpos: + m = attrfind_tolerant.match(rawdata, k) + if not m: + break + attrname, rest, attrvalue = m.group(1, 2, 3) + if not rest: + attrvalue = None + elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ + attrvalue[:1] == '"' == attrvalue[-1:]: + attrvalue = attrvalue[1:-1] + if attrvalue: + attrvalue = unescape(attrvalue) + attrs.append((attrname.lower(), attrvalue)) + k = m.end() + + end = rawdata[k:endpos].strip() + if end not in (">", "/>"): + lineno, offset = self.getpos() + if "\n" in self.__starttag_text: + lineno = lineno + self.__starttag_text.count("\n") + offset = len(self.__starttag_text) \ + - self.__starttag_text.rfind("\n") + else: + offset = offset + len(self.__starttag_text) + self.handle_data(rawdata[i:endpos]) + return endpos + if end.endswith('/>'): + # XHTML-style empty tag: + self.handle_startendtag(tag, attrs) + else: + self.handle_starttag(tag, attrs) + if tag in self.CDATA_CONTENT_ELEMENTS: + self.set_cdata_mode(tag) + return endpos + + # Internal -- check to see if we have a complete starttag; return end + # or -1 if incomplete. + def check_for_whole_start_tag(self, i): + rawdata = self.rawdata + m = locatestarttagend_tolerant.match(rawdata, i) + if m: + j = m.end() + next = rawdata[j:j+1] + if next == ">": + return j + 1 + if next == "/": + if rawdata.startswith("/>", j): + return j + 2 + if rawdata.startswith("/", j): + # buffer boundary + return -1 + # else bogus input + if j > i: + return j + else: + return i + 1 + if next == "": + # end of input + return -1 + if next in ("abcdefghijklmnopqrstuvwxyz=/" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ"): + # end of input in or before attribute value, or we have the + # '/' from a '/>' ending + return -1 + if j > i: + return j + else: + return i + 1 + raise AssertionError("we should not get here!") + + # Internal -- parse endtag, return end or -1 if incomplete + def parse_endtag(self, i): + rawdata = self.rawdata + assert rawdata[i:i+2] == " + if not match: + return -1 + gtpos = match.end() + match = endtagfind.match(rawdata, i) # + if not match: + if self.cdata_elem is not None: + self.handle_data(rawdata[i:gtpos]) + return gtpos + # find the name: w3.org/TR/html5/tokenization.html#tag-name-state + namematch = tagfind_tolerant.match(rawdata, i+2) + if not namematch: + # w3.org/TR/html5/tokenization.html#end-tag-open-state + if rawdata[i:i+3] == '': + return i+3 + else: + return self.parse_bogus_comment(i) + tagname = namematch.group(1).lower() + # consume and ignore other stuff between the name and the > + # Note: this is not 100% correct, since we might have things like + # , but looking for > after tha name should cover + # most of the cases and is much simpler + gtpos = rawdata.find('>', namematch.end()) + self.handle_endtag(tagname) + return gtpos+1 + + elem = match.group(1).lower() # script or style + if self.cdata_elem is not None: + if elem != self.cdata_elem: + self.handle_data(rawdata[i:gtpos]) + return gtpos + + self.handle_endtag(elem.lower()) + self.clear_cdata_mode() + return gtpos + + # Overridable -- finish processing of start+end tag: + def handle_startendtag(self, tag, attrs): + self.handle_starttag(tag, attrs) + self.handle_endtag(tag) + + # Overridable -- handle start tag + def handle_starttag(self, tag, attrs): + pass + + # Overridable -- handle end tag + def handle_endtag(self, tag): + pass + + # Overridable -- handle character reference + def handle_charref(self, name): + pass + + # Overridable -- handle entity reference + def handle_entityref(self, name): + pass + + # Overridable -- handle data + def handle_data(self, data): + pass + + # Overridable -- handle comment + def handle_comment(self, data): + pass + + # Overridable -- handle declaration + def handle_decl(self, decl): + pass + + # Overridable -- handle processing instruction + def handle_pi(self, data): + pass + + def unknown_decl(self, data): + pass + + # Internal -- helper to remove special character quoting + def unescape(self, s): + warnings.warn('The unescape method is deprecated and will be removed ' + 'in 3.5, use html.unescape() instead.', + DeprecationWarning, stacklevel=2) + return unescape(s) diff --git a/Lib/http/__init__.py b/Lib/http/__init__.py new file mode 100644 index 0000000000..d4334cc88f --- /dev/null +++ b/Lib/http/__init__.py @@ -0,0 +1,134 @@ +from enum import IntEnum + +__all__ = ['HTTPStatus'] + +class HTTPStatus(IntEnum): + """HTTP status codes and reason phrases + + Status codes from the following RFCs are all observed: + + * RFC 7231: Hypertext Transfer Protocol (HTTP/1.1), obsoletes 2616 + * RFC 6585: Additional HTTP Status Codes + * RFC 3229: Delta encoding in HTTP + * RFC 4918: HTTP Extensions for WebDAV, obsoletes 2518 + * RFC 5842: Binding Extensions to WebDAV + * RFC 7238: Permanent Redirect + * RFC 2295: Transparent Content Negotiation in HTTP + * RFC 2774: An HTTP Extension Framework + """ + def __new__(cls, value, phrase, description=''): + obj = int.__new__(cls, value) + obj._value_ = value + + obj.phrase = phrase + obj.description = description + return obj + + # informational + CONTINUE = 100, 'Continue', 'Request received, please continue' + SWITCHING_PROTOCOLS = (101, 'Switching Protocols', + 'Switching to new protocol; obey Upgrade header') + PROCESSING = 102, 'Processing' + + # success + OK = 200, 'OK', 'Request fulfilled, document follows' + CREATED = 201, 'Created', 'Document created, URL follows' + ACCEPTED = (202, 'Accepted', + 'Request accepted, processing continues off-line') + NON_AUTHORITATIVE_INFORMATION = (203, + 'Non-Authoritative Information', 'Request fulfilled from cache') + NO_CONTENT = 204, 'No Content', 'Request fulfilled, nothing follows' + RESET_CONTENT = 205, 'Reset Content', 'Clear input form for further input' + PARTIAL_CONTENT = 206, 'Partial Content', 'Partial content follows' + MULTI_STATUS = 207, 'Multi-Status' + ALREADY_REPORTED = 208, 'Already Reported' + IM_USED = 226, 'IM Used' + + # redirection + MULTIPLE_CHOICES = (300, 'Multiple Choices', + 'Object has several resources -- see URI list') + MOVED_PERMANENTLY = (301, 'Moved Permanently', + 'Object moved permanently -- see URI list') + FOUND = 302, 'Found', 'Object moved temporarily -- see URI list' + SEE_OTHER = 303, 'See Other', 'Object moved -- see Method and URL list' + NOT_MODIFIED = (304, 'Not Modified', + 'Document has not changed since given time') + USE_PROXY = (305, 'Use Proxy', + 'You must use proxy specified in Location to access this resource') + TEMPORARY_REDIRECT = (307, 'Temporary Redirect', + 'Object moved temporarily -- see URI list') + PERMANENT_REDIRECT = (308, 'Permanent Redirect', + 'Object moved temporarily -- see URI list') + + # client error + BAD_REQUEST = (400, 'Bad Request', + 'Bad request syntax or unsupported method') + UNAUTHORIZED = (401, 'Unauthorized', + 'No permission -- see authorization schemes') + PAYMENT_REQUIRED = (402, 'Payment Required', + 'No payment -- see charging schemes') + FORBIDDEN = (403, 'Forbidden', + 'Request forbidden -- authorization will not help') + NOT_FOUND = (404, 'Not Found', + 'Nothing matches the given URI') + METHOD_NOT_ALLOWED = (405, 'Method Not Allowed', + 'Specified method is invalid for this resource') + NOT_ACCEPTABLE = (406, 'Not Acceptable', + 'URI not available in preferred format') + PROXY_AUTHENTICATION_REQUIRED = (407, + 'Proxy Authentication Required', + 'You must authenticate with this proxy before proceeding') + REQUEST_TIMEOUT = (408, 'Request Timeout', + 'Request timed out; try again later') + CONFLICT = 409, 'Conflict', 'Request conflict' + GONE = (410, 'Gone', + 'URI no longer exists and has been permanently removed') + LENGTH_REQUIRED = (411, 'Length Required', + 'Client must specify Content-Length') + PRECONDITION_FAILED = (412, 'Precondition Failed', + 'Precondition in headers is false') + REQUEST_ENTITY_TOO_LARGE = (413, 'Request Entity Too Large', + 'Entity is too large') + REQUEST_URI_TOO_LONG = (414, 'Request-URI Too Long', + 'URI is too long') + UNSUPPORTED_MEDIA_TYPE = (415, 'Unsupported Media Type', + 'Entity body in unsupported format') + REQUESTED_RANGE_NOT_SATISFIABLE = (416, + 'Requested Range Not Satisfiable', + 'Cannot satisfy request range') + EXPECTATION_FAILED = (417, 'Expectation Failed', + 'Expect condition could not be satisfied') + UNPROCESSABLE_ENTITY = 422, 'Unprocessable Entity' + LOCKED = 423, 'Locked' + FAILED_DEPENDENCY = 424, 'Failed Dependency' + UPGRADE_REQUIRED = 426, 'Upgrade Required' + PRECONDITION_REQUIRED = (428, 'Precondition Required', + 'The origin server requires the request to be conditional') + TOO_MANY_REQUESTS = (429, 'Too Many Requests', + 'The user has sent too many requests in ' + 'a given amount of time ("rate limiting")') + REQUEST_HEADER_FIELDS_TOO_LARGE = (431, + 'Request Header Fields Too Large', + 'The server is unwilling to process the request because its header ' + 'fields are too large') + + # server errors + INTERNAL_SERVER_ERROR = (500, 'Internal Server Error', + 'Server got itself in trouble') + NOT_IMPLEMENTED = (501, 'Not Implemented', + 'Server does not support this operation') + BAD_GATEWAY = (502, 'Bad Gateway', + 'Invalid responses from another server/proxy') + SERVICE_UNAVAILABLE = (503, 'Service Unavailable', + 'The server cannot process the request due to a high load') + GATEWAY_TIMEOUT = (504, 'Gateway Timeout', + 'The gateway server did not receive a timely response') + HTTP_VERSION_NOT_SUPPORTED = (505, 'HTTP Version Not Supported', + 'Cannot fulfill request') + VARIANT_ALSO_NEGOTIATES = 506, 'Variant Also Negotiates' + INSUFFICIENT_STORAGE = 507, 'Insufficient Storage' + LOOP_DETECTED = 508, 'Loop Detected' + NOT_EXTENDED = 510, 'Not Extended' + NETWORK_AUTHENTICATION_REQUIRED = (511, + 'Network Authentication Required', + 'The client needs to authenticate to gain network access') diff --git a/Lib/http/client.py b/Lib/http/client.py new file mode 100644 index 0000000000..a8e59b9561 --- /dev/null +++ b/Lib/http/client.py @@ -0,0 +1,1478 @@ +r"""HTTP/1.1 client library + + + + +HTTPConnection goes through a number of "states", which define when a client +may legally make another request or fetch the response for a particular +request. This diagram details these state transitions: + + (null) + | + | HTTPConnection() + v + Idle + | + | putrequest() + v + Request-started + | + | ( putheader() )* endheaders() + v + Request-sent + |\_____________________________ + | | getresponse() raises + | response = getresponse() | ConnectionError + v v + Unread-response Idle + [Response-headers-read] + |\____________________ + | | + | response.read() | putrequest() + v v + Idle Req-started-unread-response + ______/| + / | + response.read() | | ( putheader() )* endheaders() + v v + Request-started Req-sent-unread-response + | + | response.read() + v + Request-sent + +This diagram presents the following rules: + -- a second request may not be started until {response-headers-read} + -- a response [object] cannot be retrieved until {request-sent} + -- there is no differentiation between an unread response body and a + partially read response body + +Note: this enforcement is applied by the HTTPConnection class. The + HTTPResponse class does not enforce this state machine, which + implies sophisticated clients may accelerate the request/response + pipeline. Caution should be taken, though: accelerating the states + beyond the above pattern may imply knowledge of the server's + connection-close behavior for certain requests. For example, it + is impossible to tell whether the server will close the connection + UNTIL the response headers have been read; this means that further + requests cannot be placed into the pipeline until it is known that + the server will NOT be closing the connection. + +Logical State __state __response +------------- ------- ---------- +Idle _CS_IDLE None +Request-started _CS_REQ_STARTED None +Request-sent _CS_REQ_SENT None +Unread-response _CS_IDLE +Req-started-unread-response _CS_REQ_STARTED +Req-sent-unread-response _CS_REQ_SENT +""" + +import email.parser +import email.message +import http +import io +import os +import re +import socket +import collections +from urllib.parse import urlsplit + +# HTTPMessage, parse_headers(), and the HTTP status code constants are +# intentionally omitted for simplicity +__all__ = ["HTTPResponse", "HTTPConnection", + "HTTPException", "NotConnected", "UnknownProtocol", + "UnknownTransferEncoding", "UnimplementedFileMode", + "IncompleteRead", "InvalidURL", "ImproperConnectionState", + "CannotSendRequest", "CannotSendHeader", "ResponseNotReady", + "BadStatusLine", "LineTooLong", "RemoteDisconnected", "error", + "responses"] + +HTTP_PORT = 80 +HTTPS_PORT = 443 + +_UNKNOWN = 'UNKNOWN' + +# connection states +_CS_IDLE = 'Idle' +_CS_REQ_STARTED = 'Request-started' +_CS_REQ_SENT = 'Request-sent' + + +# hack to maintain backwards compatibility +globals().update(http.HTTPStatus.__members__) + +# another hack to maintain backwards compatibility +# Mapping status codes to official W3C names +responses = {v: v.phrase for v in http.HTTPStatus.__members__.values()} + +# maximal amount of data to read at one time in _safe_read +MAXAMOUNT = 1048576 + +# maximal line length when calling readline(). +_MAXLINE = 65536 +_MAXHEADERS = 100 + +# Header name/value ABNF (http://tools.ietf.org/html/rfc7230#section-3.2) +# +# VCHAR = %x21-7E +# obs-text = %x80-FF +# header-field = field-name ":" OWS field-value OWS +# field-name = token +# field-value = *( field-content / obs-fold ) +# field-content = field-vchar [ 1*( SP / HTAB ) field-vchar ] +# field-vchar = VCHAR / obs-text +# +# obs-fold = CRLF 1*( SP / HTAB ) +# ; obsolete line folding +# ; see Section 3.2.4 + +# token = 1*tchar +# +# tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" +# / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~" +# / DIGIT / ALPHA +# ; any VCHAR, except delimiters +# +# VCHAR defined in http://tools.ietf.org/html/rfc5234#appendix-B.1 + +# the patterns for both name and value are more lenient than RFC +# definitions to allow for backwards compatibility +_is_legal_header_name = re.compile(rb'[^:\s][^:\r\n]*').fullmatch +_is_illegal_header_value = re.compile(rb'\n(?![ \t])|\r(?![ \t\n])').search + +# We always set the Content-Length header for these methods because some +# servers will otherwise respond with a 411 +_METHODS_EXPECTING_BODY = {'PATCH', 'POST', 'PUT'} + + +def _encode(data, name='data'): + """Call data.encode("latin-1") but show a better error message.""" + try: + return data.encode("latin-1") + except UnicodeEncodeError as err: + raise UnicodeEncodeError( + err.encoding, + err.object, + err.start, + err.end, + "%s (%.20r) is not valid Latin-1. Use %s.encode('utf-8') " + "if you want to send it encoded in UTF-8." % + (name.title(), data[err.start:err.end], name)) from None + + +class HTTPMessage(email.message.Message): + # XXX The only usage of this method is in + # http.server.CGIHTTPRequestHandler. Maybe move the code there so + # that it doesn't need to be part of the public API. The API has + # never been defined so this could cause backwards compatibility + # issues. + + def getallmatchingheaders(self, name): + """Find all header lines matching a given header name. + + Look through the list of headers and find all lines matching a given + header name (and their continuation lines). A list of the lines is + returned, without interpretation. If the header does not occur, an + empty list is returned. If the header occurs multiple times, all + occurrences are returned. Case is not important in the header name. + + """ + name = name.lower() + ':' + n = len(name) + lst = [] + hit = 0 + for line in self.keys(): + if line[:n].lower() == name: + hit = 1 + elif not line[:1].isspace(): + hit = 0 + if hit: + lst.append(line) + return lst + +def parse_headers(fp, _class=HTTPMessage): + """Parses only RFC2822 headers from a file pointer. + + email Parser wants to see strings rather than bytes. + But a TextIOWrapper around self.rfile would buffer too many bytes + from the stream, bytes which we later need to read as bytes. + So we read the correct bytes here, as bytes, for email Parser + to parse. + + """ + headers = [] + while True: + line = fp.readline(_MAXLINE + 1) + if len(line) > _MAXLINE: + raise LineTooLong("header line") + headers.append(line) + if len(headers) > _MAXHEADERS: + raise HTTPException("got more than %d headers" % _MAXHEADERS) + if line in (b'\r\n', b'\n', b''): + break + hstring = b''.join(headers).decode('iso-8859-1') + return email.parser.Parser(_class=_class).parsestr(hstring) + + +class HTTPResponse(io.BufferedIOBase): + + # See RFC 2616 sec 19.6 and RFC 1945 sec 6 for details. + + # The bytes from the socket object are iso-8859-1 strings. + # See RFC 2616 sec 2.2 which notes an exception for MIME-encoded + # text following RFC 2047. The basic status line parsing only + # accepts iso-8859-1. + + def __init__(self, sock, debuglevel=0, method=None, url=None): + # If the response includes a content-length header, we need to + # make sure that the client doesn't read more than the + # specified number of bytes. If it does, it will block until + # the server times out and closes the connection. This will + # happen if a self.fp.read() is done (without a size) whether + # self.fp is buffered or not. So, no self.fp.read() by + # clients unless they know what they are doing. + self.fp = sock.makefile("rb") + self.debuglevel = debuglevel + self._method = method + + # The HTTPResponse object is returned via urllib. The clients + # of http and urllib expect different attributes for the + # headers. headers is used here and supports urllib. msg is + # provided as a backwards compatibility layer for http + # clients. + + self.headers = self.msg = None + + # from the Status-Line of the response + self.version = _UNKNOWN # HTTP-Version + self.status = _UNKNOWN # Status-Code + self.reason = _UNKNOWN # Reason-Phrase + + self.chunked = _UNKNOWN # is "chunked" being used? + self.chunk_left = _UNKNOWN # bytes left to read in current chunk + self.length = _UNKNOWN # number of bytes left in response + self.will_close = _UNKNOWN # conn will close at end of response + + def _read_status(self): + line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1") + if len(line) > _MAXLINE: + raise LineTooLong("status line") + if self.debuglevel > 0: + print("reply:", repr(line)) + if not line: + # Presumably, the server closed the connection before + # sending a valid response. + raise RemoteDisconnected("Remote end closed connection without" + " response") + try: + version, status, reason = line.split(None, 2) + except ValueError: + try: + version, status = line.split(None, 1) + reason = "" + except ValueError: + # empty version will cause next test to fail. + version = "" + if not version.startswith("HTTP/"): + self._close_conn() + raise BadStatusLine(line) + + # The status code is a three-digit number + try: + status = int(status) + if status < 100 or status > 999: + raise BadStatusLine(line) + except ValueError: + raise BadStatusLine(line) + return version, status, reason + + def begin(self): + if self.headers is not None: + # we've already started reading the response + return + + # read until we get a non-100 response + while True: + version, status, reason = self._read_status() + if status != CONTINUE: + break + # skip the header from the 100 response + while True: + skip = self.fp.readline(_MAXLINE + 1) + if len(skip) > _MAXLINE: + raise LineTooLong("header line") + skip = skip.strip() + if not skip: + break + if self.debuglevel > 0: + print("header:", skip) + + self.code = self.status = status + self.reason = reason.strip() + if version in ("HTTP/1.0", "HTTP/0.9"): + # Some servers might still return "0.9", treat it as 1.0 anyway + self.version = 10 + elif version.startswith("HTTP/1."): + self.version = 11 # use HTTP/1.1 code for HTTP/1.x where x>=1 + else: + raise UnknownProtocol(version) + + self.headers = self.msg = parse_headers(self.fp) + + if self.debuglevel > 0: + for hdr in self.headers: + print("header:", hdr, end=" ") + + # are we using the chunked-style of transfer encoding? + tr_enc = self.headers.get("transfer-encoding") + if tr_enc and tr_enc.lower() == "chunked": + self.chunked = True + self.chunk_left = None + else: + self.chunked = False + + # will the connection close at the end of the response? + self.will_close = self._check_close() + + # do we have a Content-Length? + # NOTE: RFC 2616, S4.4, #3 says we ignore this if tr_enc is "chunked" + self.length = None + length = self.headers.get("content-length") + + # are we using the chunked-style of transfer encoding? + tr_enc = self.headers.get("transfer-encoding") + if length and not self.chunked: + try: + self.length = int(length) + except ValueError: + self.length = None + else: + if self.length < 0: # ignore nonsensical negative lengths + self.length = None + else: + self.length = None + + # does the body have a fixed length? (of zero) + if (status == NO_CONTENT or status == NOT_MODIFIED or + 100 <= status < 200 or # 1xx codes + self._method == "HEAD"): + self.length = 0 + + # if the connection remains open, and we aren't using chunked, and + # a content-length was not provided, then assume that the connection + # WILL close. + if (not self.will_close and + not self.chunked and + self.length is None): + self.will_close = True + + def _check_close(self): + conn = self.headers.get("connection") + if self.version == 11: + # An HTTP/1.1 proxy is assumed to stay open unless + # explicitly closed. + conn = self.headers.get("connection") + if conn and "close" in conn.lower(): + return True + return False + + # Some HTTP/1.0 implementations have support for persistent + # connections, using rules different than HTTP/1.1. + + # For older HTTP, Keep-Alive indicates persistent connection. + if self.headers.get("keep-alive"): + return False + + # At least Akamai returns a "Connection: Keep-Alive" header, + # which was supposed to be sent by the client. + if conn and "keep-alive" in conn.lower(): + return False + + # Proxy-Connection is a netscape hack. + pconn = self.headers.get("proxy-connection") + if pconn and "keep-alive" in pconn.lower(): + return False + + # otherwise, assume it will close + return True + + def _close_conn(self): + fp = self.fp + self.fp = None + fp.close() + + def close(self): + try: + super().close() # set "closed" flag + finally: + if self.fp: + self._close_conn() + + # These implementations are for the benefit of io.BufferedReader. + + # XXX This class should probably be revised to act more like + # the "raw stream" that BufferedReader expects. + + def flush(self): + super().flush() + if self.fp: + self.fp.flush() + + def readable(self): + """Always returns True""" + return True + + # End of "raw stream" methods + + def isclosed(self): + """True if the connection is closed.""" + # NOTE: it is possible that we will not ever call self.close(). This + # case occurs when will_close is TRUE, length is None, and we + # read up to the last byte, but NOT past it. + # + # IMPLIES: if will_close is FALSE, then self.close() will ALWAYS be + # called, meaning self.isclosed() is meaningful. + return self.fp is None + + def read(self, amt=None): + if self.fp is None: + return b"" + + if self._method == "HEAD": + self._close_conn() + return b"" + + if amt is not None: + # Amount is given, implement using readinto + b = bytearray(amt) + n = self.readinto(b) + return memoryview(b)[:n].tobytes() + else: + # Amount is not given (unbounded read) so we must check self.length + # and self.chunked + + if self.chunked: + return self._readall_chunked() + + if self.length is None: + s = self.fp.read() + else: + try: + s = self._safe_read(self.length) + except IncompleteRead: + self._close_conn() + raise + self.length = 0 + self._close_conn() # we read everything + return s + + def readinto(self, b): + """Read up to len(b) bytes into bytearray b and return the number + of bytes read. + """ + + if self.fp is None: + return 0 + + if self._method == "HEAD": + self._close_conn() + return 0 + + if self.chunked: + return self._readinto_chunked(b) + + if self.length is not None: + if len(b) > self.length: + # clip the read to the "end of response" + b = memoryview(b)[0:self.length] + + # we do not use _safe_read() here because this may be a .will_close + # connection, and the user is reading more bytes than will be provided + # (for example, reading in 1k chunks) + n = self.fp.readinto(b) + if not n and b: + # Ideally, we would raise IncompleteRead if the content-length + # wasn't satisfied, but it might break compatibility. + self._close_conn() + elif self.length is not None: + self.length -= n + if not self.length: + self._close_conn() + return n + + def _read_next_chunk_size(self): + # Read the next chunk size from the file + line = self.fp.readline(_MAXLINE + 1) + if len(line) > _MAXLINE: + raise LineTooLong("chunk size") + i = line.find(b";") + if i >= 0: + line = line[:i] # strip chunk-extensions + try: + return int(line, 16) + except ValueError: + # close the connection as protocol synchronisation is + # probably lost + self._close_conn() + raise + + def _read_and_discard_trailer(self): + # read and discard trailer up to the CRLF terminator + ### note: we shouldn't have any trailers! + while True: + line = self.fp.readline(_MAXLINE + 1) + if len(line) > _MAXLINE: + raise LineTooLong("trailer line") + if not line: + # a vanishingly small number of sites EOF without + # sending the trailer + break + if line in (b'\r\n', b'\n', b''): + break + + def _get_chunk_left(self): + # return self.chunk_left, reading a new chunk if necessary. + # chunk_left == 0: at the end of the current chunk, need to close it + # chunk_left == None: No current chunk, should read next. + # This function returns non-zero or None if the last chunk has + # been read. + chunk_left = self.chunk_left + if not chunk_left: # Can be 0 or None + if chunk_left is not None: + # We are at the end of chunk. dicard chunk end + self._safe_read(2) # toss the CRLF at the end of the chunk + try: + chunk_left = self._read_next_chunk_size() + except ValueError: + raise IncompleteRead(b'') + if chunk_left == 0: + # last chunk: 1*("0") [ chunk-extension ] CRLF + self._read_and_discard_trailer() + # we read everything; close the "file" + self._close_conn() + chunk_left = None + self.chunk_left = chunk_left + return chunk_left + + def _readall_chunked(self): + assert self.chunked != _UNKNOWN + value = [] + try: + while True: + chunk_left = self._get_chunk_left() + if chunk_left is None: + break + value.append(self._safe_read(chunk_left)) + self.chunk_left = 0 + return b''.join(value) + except IncompleteRead: + raise IncompleteRead(b''.join(value)) + + def _readinto_chunked(self, b): + assert self.chunked != _UNKNOWN + total_bytes = 0 + mvb = memoryview(b) + try: + while True: + chunk_left = self._get_chunk_left() + if chunk_left is None: + return total_bytes + + if len(mvb) <= chunk_left: + n = self._safe_readinto(mvb) + self.chunk_left = chunk_left - n + return total_bytes + n + + temp_mvb = mvb[:chunk_left] + n = self._safe_readinto(temp_mvb) + mvb = mvb[n:] + total_bytes += n + self.chunk_left = 0 + + except IncompleteRead: + raise IncompleteRead(bytes(b[0:total_bytes])) + + def _safe_read(self, amt): + """Read the number of bytes requested, compensating for partial reads. + + Normally, we have a blocking socket, but a read() can be interrupted + by a signal (resulting in a partial read). + + Note that we cannot distinguish between EOF and an interrupt when zero + bytes have been read. IncompleteRead() will be raised in this + situation. + + This function should be used when bytes "should" be present for + reading. If the bytes are truly not available (due to EOF), then the + IncompleteRead exception can be used to detect the problem. + """ + s = [] + while amt > 0: + chunk = self.fp.read(min(amt, MAXAMOUNT)) + if not chunk: + raise IncompleteRead(b''.join(s), amt) + s.append(chunk) + amt -= len(chunk) + return b"".join(s) + + def _safe_readinto(self, b): + """Same as _safe_read, but for reading into a buffer.""" + total_bytes = 0 + mvb = memoryview(b) + while total_bytes < len(b): + if MAXAMOUNT < len(mvb): + temp_mvb = mvb[0:MAXAMOUNT] + n = self.fp.readinto(temp_mvb) + else: + n = self.fp.readinto(mvb) + if not n: + raise IncompleteRead(bytes(mvb[0:total_bytes]), len(b)) + mvb = mvb[n:] + total_bytes += n + return total_bytes + + def read1(self, n=-1): + """Read with at most one underlying system call. If at least one + byte is buffered, return that instead. + """ + if self.fp is None or self._method == "HEAD": + return b"" + if self.chunked: + return self._read1_chunked(n) + if self.length is not None and (n < 0 or n > self.length): + n = self.length + try: + result = self.fp.read1(n) + except ValueError: + if n >= 0: + raise + # some implementations, like BufferedReader, don't support -1 + # Read an arbitrarily selected largeish chunk. + result = self.fp.read1(16*1024) + if not result and n: + self._close_conn() + elif self.length is not None: + self.length -= len(result) + return result + + def peek(self, n=-1): + # Having this enables IOBase.readline() to read more than one + # byte at a time + if self.fp is None or self._method == "HEAD": + return b"" + if self.chunked: + return self._peek_chunked(n) + return self.fp.peek(n) + + def readline(self, limit=-1): + if self.fp is None or self._method == "HEAD": + return b"" + if self.chunked: + # Fallback to IOBase readline which uses peek() and read() + return super().readline(limit) + if self.length is not None and (limit < 0 or limit > self.length): + limit = self.length + result = self.fp.readline(limit) + if not result and limit: + self._close_conn() + elif self.length is not None: + self.length -= len(result) + return result + + def _read1_chunked(self, n): + # Strictly speaking, _get_chunk_left() may cause more than one read, + # but that is ok, since that is to satisfy the chunked protocol. + chunk_left = self._get_chunk_left() + if chunk_left is None or n == 0: + return b'' + if not (0 <= n <= chunk_left): + n = chunk_left # if n is negative or larger than chunk_left + read = self.fp.read1(n) + self.chunk_left -= len(read) + if not read: + raise IncompleteRead(b"") + return read + + def _peek_chunked(self, n): + # Strictly speaking, _get_chunk_left() may cause more than one read, + # but that is ok, since that is to satisfy the chunked protocol. + try: + chunk_left = self._get_chunk_left() + except IncompleteRead: + return b'' # peek doesn't worry about protocol + if chunk_left is None: + return b'' # eof + # peek is allowed to return more than requested. Just request the + # entire chunk, and truncate what we get. + return self.fp.peek(chunk_left)[:chunk_left] + + def fileno(self): + return self.fp.fileno() + + def getheader(self, name, default=None): + '''Returns the value of the header matching *name*. + + If there are multiple matching headers, the values are + combined into a single string separated by commas and spaces. + + If no matching header is found, returns *default* or None if + the *default* is not specified. + + If the headers are unknown, raises http.client.ResponseNotReady. + + ''' + if self.headers is None: + raise ResponseNotReady() + headers = self.headers.get_all(name) or default + if isinstance(headers, str) or not hasattr(headers, '__iter__'): + return headers + else: + return ', '.join(headers) + + def getheaders(self): + """Return list of (header, value) tuples.""" + if self.headers is None: + raise ResponseNotReady() + return list(self.headers.items()) + + # We override IOBase.__iter__ so that it doesn't check for closed-ness + + def __iter__(self): + return self + + # For compatibility with old-style urllib responses. + + def info(self): + '''Returns an instance of the class mimetools.Message containing + meta-information associated with the URL. + + When the method is HTTP, these headers are those returned by + the server at the head of the retrieved HTML page (including + Content-Length and Content-Type). + + When the method is FTP, a Content-Length header will be + present if (as is now usual) the server passed back a file + length in response to the FTP retrieval request. A + Content-Type header will be present if the MIME type can be + guessed. + + When the method is local-file, returned headers will include + a Date representing the file's last-modified time, a + Content-Length giving file size, and a Content-Type + containing a guess at the file's type. See also the + description of the mimetools module. + + ''' + return self.headers + + def geturl(self): + '''Return the real URL of the page. + + In some cases, the HTTP server redirects a client to another + URL. The urlopen() function handles this transparently, but in + some cases the caller needs to know which URL the client was + redirected to. The geturl() method can be used to get at this + redirected URL. + + ''' + return self.url + + def getcode(self): + '''Return the HTTP status code that was sent with the response, + or None if the URL is not an HTTP URL. + + ''' + return self.status + +class HTTPConnection: + + _http_vsn = 11 + _http_vsn_str = 'HTTP/1.1' + + response_class = HTTPResponse + default_port = HTTP_PORT + auto_open = 1 + debuglevel = 0 + + @staticmethod + def _is_textIO(stream): + """Test whether a file-like object is a text or a binary stream. + """ + return isinstance(stream, io.TextIOBase) + + @staticmethod + def _get_content_length(body, method): + """Get the content-length based on the body. + + If the body is None, we set Content-Length: 0 for methods that expect + a body (RFC 7230, Section 3.3.2). We also set the Content-Length for + any method if the body is a str or bytes-like object and not a file. + """ + if body is None: + # do an explicit check for not None here to distinguish + # between unset and set but empty + if method.upper() in _METHODS_EXPECTING_BODY: + return 0 + else: + return None + + if hasattr(body, 'read'): + # file-like object. + return None + + try: + # does it implement the buffer protocol (bytes, bytearray, array)? + mv = memoryview(body) + return mv.nbytes + except TypeError: + pass + + if isinstance(body, str): + return len(body) + + return None + + def __init__(self, host, port=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, + source_address=None): + self.timeout = timeout + self.source_address = source_address + self.sock = None + self._buffer = [] + self.__response = None + self.__state = _CS_IDLE + self._method = None + self._tunnel_host = None + self._tunnel_port = None + self._tunnel_headers = {} + + (self.host, self.port) = self._get_hostport(host, port) + + # This is stored as an instance variable to allow unit + # tests to replace it with a suitable mockup + self._create_connection = socket.create_connection + + def set_tunnel(self, host, port=None, headers=None): + """Set up host and port for HTTP CONNECT tunnelling. + + In a connection that uses HTTP CONNECT tunneling, the host passed to the + constructor is used as a proxy server that relays all communication to + the endpoint passed to `set_tunnel`. This done by sending an HTTP + CONNECT request to the proxy server when the connection is established. + + This method must be called before the HTML connection has been + established. + + The headers argument should be a mapping of extra HTTP headers to send + with the CONNECT request. + """ + + if self.sock: + raise RuntimeError("Can't set up tunnel for established connection") + + self._tunnel_host, self._tunnel_port = self._get_hostport(host, port) + if headers: + self._tunnel_headers = headers + else: + self._tunnel_headers.clear() + + def _get_hostport(self, host, port): + if port is None: + i = host.rfind(':') + j = host.rfind(']') # ipv6 addresses have [...] + if i > j: + try: + port = int(host[i+1:]) + except ValueError: + if host[i+1:] == "": # http://foo.com:/ == http://foo.com/ + port = self.default_port + else: + raise InvalidURL("nonnumeric port: '%s'" % host[i+1:]) + host = host[:i] + else: + port = self.default_port + if host and host[0] == '[' and host[-1] == ']': + host = host[1:-1] + + return (host, port) + + def set_debuglevel(self, level): + self.debuglevel = level + + def _tunnel(self): + connect_str = "CONNECT %s:%d HTTP/1.0\r\n" % (self._tunnel_host, + self._tunnel_port) + connect_bytes = connect_str.encode("ascii") + self.send(connect_bytes) + for header, value in self._tunnel_headers.items(): + header_str = "%s: %s\r\n" % (header, value) + header_bytes = header_str.encode("latin-1") + self.send(header_bytes) + self.send(b'\r\n') + + response = self.response_class(self.sock, method=self._method) + (version, code, message) = response._read_status() + + if code != http.HTTPStatus.OK: + self.close() + raise OSError("Tunnel connection failed: %d %s" % (code, + message.strip())) + while True: + line = response.fp.readline(_MAXLINE + 1) + if len(line) > _MAXLINE: + raise LineTooLong("header line") + if not line: + # for sites which EOF without sending a trailer + break + if line in (b'\r\n', b'\n', b''): + break + + if self.debuglevel > 0: + print('header:', line.decode()) + + def connect(self): + """Connect to the host and port specified in __init__.""" + self.sock = self._create_connection( + (self.host,self.port), self.timeout, self.source_address) + self.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) + + if self._tunnel_host: + self._tunnel() + + def close(self): + """Close the connection to the HTTP server.""" + self.__state = _CS_IDLE + try: + sock = self.sock + if sock: + self.sock = None + sock.close() # close it manually... there may be other refs + finally: + response = self.__response + if response: + self.__response = None + response.close() + + def send(self, data): + """Send `data' to the server. + ``data`` can be a string object, a bytes object, an array object, a + file-like object that supports a .read() method, or an iterable object. + """ + + if self.sock is None: + if self.auto_open: + self.connect() + else: + raise NotConnected() + + if self.debuglevel > 0: + print("send:", repr(data)) + blocksize = 8192 + if hasattr(data, "read") : + if self.debuglevel > 0: + print("sendIng a read()able") + encode = self._is_textIO(data) + if encode and self.debuglevel > 0: + print("encoding file using iso-8859-1") + while 1: + datablock = data.read(blocksize) + if not datablock: + break + if encode: + datablock = datablock.encode("iso-8859-1") + self.sock.sendall(datablock) + return + try: + self.sock.sendall(data) + except TypeError: + if isinstance(data, collections.Iterable): + for d in data: + self.sock.sendall(d) + else: + raise TypeError("data should be a bytes-like object " + "or an iterable, got %r" % type(data)) + + def _output(self, s): + """Add a line of output to the current request buffer. + + Assumes that the line does *not* end with \\r\\n. + """ + self._buffer.append(s) + + def _read_readable(self, readable): + blocksize = 8192 + if self.debuglevel > 0: + print("sendIng a read()able") + encode = self._is_textIO(readable) + if encode and self.debuglevel > 0: + print("encoding file using iso-8859-1") + while True: + datablock = readable.read(blocksize) + if not datablock: + break + if encode: + datablock = datablock.encode("iso-8859-1") + yield datablock + + def _send_output(self, message_body=None, encode_chunked=False): + """Send the currently buffered request and clear the buffer. + + Appends an extra \\r\\n to the buffer. + A message_body may be specified, to be appended to the request. + """ + self._buffer.extend((b"", b"")) + msg = b"\r\n".join(self._buffer) + del self._buffer[:] + self.send(msg) + + if message_body is not None: + + # create a consistent interface to message_body + if hasattr(message_body, 'read'): + # Let file-like take precedence over byte-like. This + # is needed to allow the current position of mmap'ed + # files to be taken into account. + chunks = self._read_readable(message_body) + else: + try: + # this is solely to check to see if message_body + # implements the buffer API. it /would/ be easier + # to capture if PyObject_CheckBuffer was exposed + # to Python. + memoryview(message_body) + except TypeError: + try: + chunks = iter(message_body) + except TypeError: + raise TypeError("message_body should be a bytes-like " + "object or an iterable, got %r" + % type(message_body)) + else: + # the object implements the buffer interface and + # can be passed directly into socket methods + chunks = (message_body,) + + for chunk in chunks: + if not chunk: + if self.debuglevel > 0: + print('Zero length chunk ignored') + continue + + if encode_chunked and self._http_vsn == 11: + # chunked encoding + chunk = f'{len(chunk):X}\r\n'.encode('ascii') + chunk \ + + b'\r\n' + self.send(chunk) + + if encode_chunked and self._http_vsn == 11: + # end chunked transfer + self.send(b'0\r\n\r\n') + + def putrequest(self, method, url, skip_host=False, + skip_accept_encoding=False): + """Send a request to the server. + + `method' specifies an HTTP request method, e.g. 'GET'. + `url' specifies the object being requested, e.g. '/index.html'. + `skip_host' if True does not add automatically a 'Host:' header + `skip_accept_encoding' if True does not add automatically an + 'Accept-Encoding:' header + """ + + # if a prior response has been completed, then forget about it. + if self.__response and self.__response.isclosed(): + self.__response = None + + + # in certain cases, we cannot issue another request on this connection. + # this occurs when: + # 1) we are in the process of sending a request. (_CS_REQ_STARTED) + # 2) a response to a previous request has signalled that it is going + # to close the connection upon completion. + # 3) the headers for the previous response have not been read, thus + # we cannot determine whether point (2) is true. (_CS_REQ_SENT) + # + # if there is no prior response, then we can request at will. + # + # if point (2) is true, then we will have passed the socket to the + # response (effectively meaning, "there is no prior response"), and + # will open a new one when a new request is made. + # + # Note: if a prior response exists, then we *can* start a new request. + # We are not allowed to begin fetching the response to this new + # request, however, until that prior response is complete. + # + if self.__state == _CS_IDLE: + self.__state = _CS_REQ_STARTED + else: + raise CannotSendRequest(self.__state) + + # Save the method we use, we need it later in the response phase + self._method = method + if not url: + url = '/' + request = '%s %s %s' % (method, url, self._http_vsn_str) + + # Non-ASCII characters should have been eliminated earlier + self._output(request.encode('ascii')) + + if self._http_vsn == 11: + # Issue some standard headers for better HTTP/1.1 compliance + + if not skip_host: + # this header is issued *only* for HTTP/1.1 + # connections. more specifically, this means it is + # only issued when the client uses the new + # HTTPConnection() class. backwards-compat clients + # will be using HTTP/1.0 and those clients may be + # issuing this header themselves. we should NOT issue + # it twice; some web servers (such as Apache) barf + # when they see two Host: headers + + # If we need a non-standard port,include it in the + # header. If the request is going through a proxy, + # but the host of the actual URL, not the host of the + # proxy. + + netloc = '' + if url.startswith('http'): + nil, netloc, nil, nil, nil = urlsplit(url) + + if netloc: + try: + netloc_enc = netloc.encode("ascii") + except UnicodeEncodeError: + netloc_enc = netloc.encode("idna") + self.putheader('Host', netloc_enc) + else: + if self._tunnel_host: + host = self._tunnel_host + port = self._tunnel_port + else: + host = self.host + port = self.port + + try: + host_enc = host.encode("ascii") + except UnicodeEncodeError: + host_enc = host.encode("idna") + + # As per RFC 273, IPv6 address should be wrapped with [] + # when used as Host header + + if host.find(':') >= 0: + host_enc = b'[' + host_enc + b']' + + if port == self.default_port: + self.putheader('Host', host_enc) + else: + host_enc = host_enc.decode("ascii") + self.putheader('Host', "%s:%s" % (host_enc, port)) + + # note: we are assuming that clients will not attempt to set these + # headers since *this* library must deal with the + # consequences. this also means that when the supporting + # libraries are updated to recognize other forms, then this + # code should be changed (removed or updated). + + # we only want a Content-Encoding of "identity" since we don't + # support encodings such as x-gzip or x-deflate. + if not skip_accept_encoding: + self.putheader('Accept-Encoding', 'identity') + + # we can accept "chunked" Transfer-Encodings, but no others + # NOTE: no TE header implies *only* "chunked" + #self.putheader('TE', 'chunked') + + # if TE is supplied in the header, then it must appear in a + # Connection header. + #self.putheader('Connection', 'TE') + + else: + # For HTTP/1.0, the server will assume "not chunked" + pass + + def putheader(self, header, *values): + """Send a request header line to the server. + + For example: h.putheader('Accept', 'text/html') + """ + if self.__state != _CS_REQ_STARTED: + raise CannotSendHeader() + + if hasattr(header, 'encode'): + header = header.encode('ascii') + + if not _is_legal_header_name(header): + raise ValueError('Invalid header name %r' % (header,)) + + values = list(values) + for i, one_value in enumerate(values): + if hasattr(one_value, 'encode'): + values[i] = one_value.encode('latin-1') + elif isinstance(one_value, int): + values[i] = str(one_value).encode('ascii') + + if _is_illegal_header_value(values[i]): + raise ValueError('Invalid header value %r' % (values[i],)) + + value = b'\r\n\t'.join(values) + header = header + b': ' + value + self._output(header) + + def endheaders(self, message_body=None, *, encode_chunked=False): + """Indicate that the last header line has been sent to the server. + + This method sends the request to the server. The optional message_body + argument can be used to pass a message body associated with the + request. + """ + if self.__state == _CS_REQ_STARTED: + self.__state = _CS_REQ_SENT + else: + raise CannotSendHeader() + self._send_output(message_body, encode_chunked=encode_chunked) + + def request(self, method, url, body=None, headers={}, *, + encode_chunked=False): + """Send a complete request to the server.""" + self._send_request(method, url, body, headers, encode_chunked) + + def _send_request(self, method, url, body, headers, encode_chunked): + # Honor explicitly requested Host: and Accept-Encoding: headers. + header_names = frozenset(k.lower() for k in headers) + skips = {} + if 'host' in header_names: + skips['skip_host'] = 1 + if 'accept-encoding' in header_names: + skips['skip_accept_encoding'] = 1 + + self.putrequest(method, url, **skips) + + # chunked encoding will happen if HTTP/1.1 is used and either + # the caller passes encode_chunked=True or the following + # conditions hold: + # 1. content-length has not been explicitly set + # 2. the body is a file or iterable, but not a str or bytes-like + # 3. Transfer-Encoding has NOT been explicitly set by the caller + + if 'content-length' not in header_names: + # only chunk body if not explicitly set for backwards + # compatibility, assuming the client code is already handling the + # chunking + if 'transfer-encoding' not in header_names: + # if content-length cannot be automatically determined, fall + # back to chunked encoding + encode_chunked = False + content_length = self._get_content_length(body, method) + if content_length is None: + if body is not None: + if self.debuglevel > 0: + print('Unable to determine size of %r' % body) + encode_chunked = True + self.putheader('Transfer-Encoding', 'chunked') + else: + self.putheader('Content-Length', str(content_length)) + else: + encode_chunked = False + + for hdr, value in headers.items(): + self.putheader(hdr, value) + if isinstance(body, str): + # RFC 2616 Section 3.7.1 says that text default has a + # default charset of iso-8859-1. + body = _encode(body, 'body') + self.endheaders(body, encode_chunked=encode_chunked) + + def getresponse(self): + """Get the response from the server. + + If the HTTPConnection is in the correct state, returns an + instance of HTTPResponse or of whatever object is returned by + the response_class variable. + + If a request has not been sent or if a previous response has + not be handled, ResponseNotReady is raised. If the HTTP + response indicates that the connection should be closed, then + it will be closed before the response is returned. When the + connection is closed, the underlying socket is closed. + """ + + # if a prior response has been completed, then forget about it. + if self.__response and self.__response.isclosed(): + self.__response = None + + # if a prior response exists, then it must be completed (otherwise, we + # cannot read this response's header to determine the connection-close + # behavior) + # + # note: if a prior response existed, but was connection-close, then the + # socket and response were made independent of this HTTPConnection + # object since a new request requires that we open a whole new + # connection + # + # this means the prior response had one of two states: + # 1) will_close: this connection was reset and the prior socket and + # response operate independently + # 2) persistent: the response was retained and we await its + # isclosed() status to become true. + # + if self.__state != _CS_REQ_SENT or self.__response: + raise ResponseNotReady(self.__state) + + if self.debuglevel > 0: + response = self.response_class(self.sock, self.debuglevel, + method=self._method) + else: + response = self.response_class(self.sock, method=self._method) + + try: + try: + response.begin() + except ConnectionError: + self.close() + raise + assert response.will_close != _UNKNOWN + self.__state = _CS_IDLE + + if response.will_close: + # this effectively passes the connection to the response + self.close() + else: + # remember this, so we can tell when it is complete + self.__response = response + + return response + except: + response.close() + raise + +try: + import ssl +except ImportError: + pass +else: + class HTTPSConnection(HTTPConnection): + "This class allows communication via SSL." + + default_port = HTTPS_PORT + + # XXX Should key_file and cert_file be deprecated in favour of context? + + def __init__(self, host, port=None, key_file=None, cert_file=None, + timeout=socket._GLOBAL_DEFAULT_TIMEOUT, + source_address=None, *, context=None, + check_hostname=None): + super(HTTPSConnection, self).__init__(host, port, timeout, + source_address) + if (key_file is not None or cert_file is not None or + check_hostname is not None): + import warnings + warnings.warn("key_file, cert_file and check_hostname are " + "deprecated, use a custom context instead.", + DeprecationWarning, 2) + self.key_file = key_file + self.cert_file = cert_file + if context is None: + context = ssl._create_default_https_context() + will_verify = context.verify_mode != ssl.CERT_NONE + if check_hostname is None: + check_hostname = context.check_hostname + if check_hostname and not will_verify: + raise ValueError("check_hostname needs a SSL context with " + "either CERT_OPTIONAL or CERT_REQUIRED") + if key_file or cert_file: + context.load_cert_chain(cert_file, key_file) + self._context = context + self._check_hostname = check_hostname + + def connect(self): + "Connect to a host on a given (SSL) port." + + super().connect() + + if self._tunnel_host: + server_hostname = self._tunnel_host + else: + server_hostname = self.host + + self.sock = self._context.wrap_socket(self.sock, + server_hostname=server_hostname) + if not self._context.check_hostname and self._check_hostname: + try: + ssl.match_hostname(self.sock.getpeercert(), server_hostname) + except Exception: + self.sock.shutdown(socket.SHUT_RDWR) + self.sock.close() + raise + + __all__.append("HTTPSConnection") + +class HTTPException(Exception): + # Subclasses that define an __init__ must call Exception.__init__ + # or define self.args. Otherwise, str() will fail. + pass + +class NotConnected(HTTPException): + pass + +class InvalidURL(HTTPException): + pass + +class UnknownProtocol(HTTPException): + def __init__(self, version): + self.args = version, + self.version = version + +class UnknownTransferEncoding(HTTPException): + pass + +class UnimplementedFileMode(HTTPException): + pass + +class IncompleteRead(HTTPException): + def __init__(self, partial, expected=None): + self.args = partial, + self.partial = partial + self.expected = expected + def __repr__(self): + if self.expected is not None: + e = ', %i more expected' % self.expected + else: + e = '' + return '%s(%i bytes read%s)' % (self.__class__.__name__, + len(self.partial), e) + def __str__(self): + return repr(self) + +class ImproperConnectionState(HTTPException): + pass + +class CannotSendRequest(ImproperConnectionState): + pass + +class CannotSendHeader(ImproperConnectionState): + pass + +class ResponseNotReady(ImproperConnectionState): + pass + +class BadStatusLine(HTTPException): + def __init__(self, line): + if not line: + line = repr(line) + self.args = line, + self.line = line + +class LineTooLong(HTTPException): + def __init__(self, line_type): + HTTPException.__init__(self, "got more than %d bytes when reading %s" + % (_MAXLINE, line_type)) + +class RemoteDisconnected(ConnectionResetError, BadStatusLine): + def __init__(self, *pos, **kw): + BadStatusLine.__init__(self, "") + ConnectionResetError.__init__(self, *pos, **kw) + +# for backwards compatibility +error = HTTPException diff --git a/Lib/http/cookiejar.py b/Lib/http/cookiejar.py new file mode 100644 index 0000000000..adf956d66a --- /dev/null +++ b/Lib/http/cookiejar.py @@ -0,0 +1,2098 @@ +r"""HTTP cookie handling for web clients. + +This module has (now fairly distant) origins in Gisle Aas' Perl module +HTTP::Cookies, from the libwww-perl library. + +Docstrings, comments and debug strings in this code refer to the +attributes of the HTTP cookie system as cookie-attributes, to distinguish +them clearly from Python attributes. + +Class diagram (note that BSDDBCookieJar and the MSIE* classes are not +distributed with the Python standard library, but are available from +http://wwwsearch.sf.net/): + + CookieJar____ + / \ \ + FileCookieJar \ \ + / | \ \ \ + MozillaCookieJar | LWPCookieJar \ \ + | | \ + | ---MSIEBase | \ + | / | | \ + | / MSIEDBCookieJar BSDDBCookieJar + |/ + MSIECookieJar + +""" + +__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy', + 'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar'] + +import copy +import datetime +import re +import time +import urllib.parse, urllib.request +try: + import threading as _threading +except ImportError: + import dummy_threading as _threading +import http.client # only for the default HTTP port +from calendar import timegm + +debug = False # set to True to enable debugging via the logging module +logger = None + +def _debug(*args): + if not debug: + return + global logger + if not logger: + import logging + logger = logging.getLogger("http.cookiejar") + return logger.debug(*args) + + +DEFAULT_HTTP_PORT = str(http.client.HTTP_PORT) +MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar " + "instance initialised with one)") + +def _warn_unhandled_exception(): + # There are a few catch-all except: statements in this module, for + # catching input that's bad in unexpected ways. Warn if any + # exceptions are caught there. + import io, warnings, traceback + f = io.StringIO() + traceback.print_exc(None, f) + msg = f.getvalue() + warnings.warn("http.cookiejar bug!\n%s" % msg, stacklevel=2) + + +# Date/time conversion +# ----------------------------------------------------------------------------- + +EPOCH_YEAR = 1970 +def _timegm(tt): + year, month, mday, hour, min, sec = tt[:6] + if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and + (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)): + return timegm(tt) + else: + return None + +DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] +MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"] +MONTHS_LOWER = [] +for month in MONTHS: MONTHS_LOWER.append(month.lower()) + +def time2isoz(t=None): + """Return a string representing time in seconds since epoch, t. + + If the function is called without an argument, it will use the current + time. + + The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ", + representing Universal Time (UTC, aka GMT). An example of this format is: + + 1994-11-24 08:49:37Z + + """ + if t is None: + dt = datetime.datetime.utcnow() + else: + dt = datetime.datetime.utcfromtimestamp(t) + return "%04d-%02d-%02d %02d:%02d:%02dZ" % ( + dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second) + +def time2netscape(t=None): + """Return a string representing time in seconds since epoch, t. + + If the function is called without an argument, it will use the current + time. + + The format of the returned string is like this: + + Wed, DD-Mon-YYYY HH:MM:SS GMT + + """ + if t is None: + dt = datetime.datetime.utcnow() + else: + dt = datetime.datetime.utcfromtimestamp(t) + return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % ( + DAYS[dt.weekday()], dt.day, MONTHS[dt.month-1], + dt.year, dt.hour, dt.minute, dt.second) + + +UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None} + +TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII) +def offset_from_tz_string(tz): + offset = None + if tz in UTC_ZONES: + offset = 0 + else: + m = TIMEZONE_RE.search(tz) + if m: + offset = 3600 * int(m.group(2)) + if m.group(3): + offset = offset + 60 * int(m.group(3)) + if m.group(1) == '-': + offset = -offset + return offset + +def _str2time(day, mon, yr, hr, min, sec, tz): + yr = int(yr) + if yr > datetime.MAXYEAR: + return None + + # translate month name to number + # month numbers start with 1 (January) + try: + mon = MONTHS_LOWER.index(mon.lower())+1 + except ValueError: + # maybe it's already a number + try: + imon = int(mon) + except ValueError: + return None + if 1 <= imon <= 12: + mon = imon + else: + return None + + # make sure clock elements are defined + if hr is None: hr = 0 + if min is None: min = 0 + if sec is None: sec = 0 + + day = int(day) + hr = int(hr) + min = int(min) + sec = int(sec) + + if yr < 1000: + # find "obvious" year + cur_yr = time.localtime(time.time())[0] + m = cur_yr % 100 + tmp = yr + yr = yr + cur_yr - m + m = m - tmp + if abs(m) > 50: + if m > 0: yr = yr + 100 + else: yr = yr - 100 + + # convert UTC time tuple to seconds since epoch (not timezone-adjusted) + t = _timegm((yr, mon, day, hr, min, sec, tz)) + + if t is not None: + # adjust time using timezone string, to get absolute time since epoch + if tz is None: + tz = "UTC" + tz = tz.upper() + offset = offset_from_tz_string(tz) + if offset is None: + return None + t = t - offset + + return t + +STRICT_DATE_RE = re.compile( + r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) " + r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII) +WEEKDAY_RE = re.compile( + r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII) +LOOSE_HTTP_DATE_RE = re.compile( + r"""^ + (\d\d?) # day + (?:\s+|[-\/]) + (\w+) # month + (?:\s+|[-\/]) + (\d+) # year + (?: + (?:\s+|:) # separator before clock + (\d\d?):(\d\d) # hour:min + (?::(\d\d))? # optional seconds + )? # optional clock + \s* + ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone + \s* + (?:\(\w+\))? # ASCII representation of timezone in parens. + \s*$""", re.X | re.ASCII) +def http2time(text): + """Returns time in seconds since epoch of time represented by a string. + + Return value is an integer. + + None is returned if the format of str is unrecognized, the time is outside + the representable range, or the timezone string is not recognized. If the + string contains no timezone, UTC is assumed. + + The timezone in the string may be numerical (like "-0800" or "+0100") or a + string timezone (like "UTC", "GMT", "BST" or "EST"). Currently, only the + timezone strings equivalent to UTC (zero offset) are known to the function. + + The function loosely parses the following formats: + + Wed, 09 Feb 1994 22:23:32 GMT -- HTTP format + Tuesday, 08-Feb-94 14:15:29 GMT -- old rfc850 HTTP format + Tuesday, 08-Feb-1994 14:15:29 GMT -- broken rfc850 HTTP format + 09 Feb 1994 22:23:32 GMT -- HTTP format (no weekday) + 08-Feb-94 14:15:29 GMT -- rfc850 format (no weekday) + 08-Feb-1994 14:15:29 GMT -- broken rfc850 format (no weekday) + + The parser ignores leading and trailing whitespace. The time may be + absent. + + If the year is given with only 2 digits, the function will select the + century that makes the year closest to the current date. + + """ + # fast exit for strictly conforming string + m = STRICT_DATE_RE.search(text) + if m: + g = m.groups() + mon = MONTHS_LOWER.index(g[1].lower()) + 1 + tt = (int(g[2]), mon, int(g[0]), + int(g[3]), int(g[4]), float(g[5])) + return _timegm(tt) + + # No, we need some messy parsing... + + # clean up + text = text.lstrip() + text = WEEKDAY_RE.sub("", text, 1) # Useless weekday + + # tz is time zone specifier string + day, mon, yr, hr, min, sec, tz = [None]*7 + + # loose regexp parse + m = LOOSE_HTTP_DATE_RE.search(text) + if m is not None: + day, mon, yr, hr, min, sec, tz = m.groups() + else: + return None # bad format + + return _str2time(day, mon, yr, hr, min, sec, tz) + +ISO_DATE_RE = re.compile( + r"""^ + (\d{4}) # year + [-\/]? + (\d\d?) # numerical month + [-\/]? + (\d\d?) # day + (?: + (?:\s+|[-:Tt]) # separator before clock + (\d\d?):?(\d\d) # hour:min + (?::?(\d\d(?:\.\d*)?))? # optional seconds (and fractional) + )? # optional clock + \s* + ([-+]?\d\d?:?(:?\d\d)? + |Z|z)? # timezone (Z is "zero meridian", i.e. GMT) + \s*$""", re.X | re. ASCII) +def iso2time(text): + """ + As for http2time, but parses the ISO 8601 formats: + + 1994-02-03 14:15:29 -0100 -- ISO 8601 format + 1994-02-03 14:15:29 -- zone is optional + 1994-02-03 -- only date + 1994-02-03T14:15:29 -- Use T as separator + 19940203T141529Z -- ISO 8601 compact format + 19940203 -- only date + + """ + # clean up + text = text.lstrip() + + # tz is time zone specifier string + day, mon, yr, hr, min, sec, tz = [None]*7 + + # loose regexp parse + m = ISO_DATE_RE.search(text) + if m is not None: + # XXX there's an extra bit of the timezone I'm ignoring here: is + # this the right thing to do? + yr, mon, day, hr, min, sec, tz, _ = m.groups() + else: + return None # bad format + + return _str2time(day, mon, yr, hr, min, sec, tz) + + +# Header parsing +# ----------------------------------------------------------------------------- + +def unmatched(match): + """Return unmatched part of re.Match object.""" + start, end = match.span(0) + return match.string[:start]+match.string[end:] + +HEADER_TOKEN_RE = re.compile(r"^\s*([^=\s;,]+)") +HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"") +HEADER_VALUE_RE = re.compile(r"^\s*=\s*([^\s;,]*)") +HEADER_ESCAPE_RE = re.compile(r"\\(.)") +def split_header_words(header_values): + r"""Parse header values into a list of lists containing key,value pairs. + + The function knows how to deal with ",", ";" and "=" as well as quoted + values after "=". A list of space separated tokens are parsed as if they + were separated by ";". + + If the header_values passed as argument contains multiple values, then they + are treated as if they were a single value separated by comma ",". + + This means that this function is useful for parsing header fields that + follow this syntax (BNF as from the HTTP/1.1 specification, but we relax + the requirement for tokens). + + headers = #header + header = (token | parameter) *( [";"] (token | parameter)) + + token = 1* + separators = "(" | ")" | "<" | ">" | "@" + | "," | ";" | ":" | "\" | <"> + | "/" | "[" | "]" | "?" | "=" + | "{" | "}" | SP | HT + + quoted-string = ( <"> *(qdtext | quoted-pair ) <"> ) + qdtext = > + quoted-pair = "\" CHAR + + parameter = attribute "=" value + attribute = token + value = token | quoted-string + + Each header is represented by a list of key/value pairs. The value for a + simple token (not part of a parameter) is None. Syntactically incorrect + headers will not necessarily be parsed as you would want. + + This is easier to describe with some examples: + + >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz']) + [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]] + >>> split_header_words(['text/html; charset="iso-8859-1"']) + [[('text/html', None), ('charset', 'iso-8859-1')]] + >>> split_header_words([r'Basic realm="\"foo\bar\""']) + [[('Basic', None), ('realm', '"foobar"')]] + + """ + assert not isinstance(header_values, str) + result = [] + for text in header_values: + orig_text = text + pairs = [] + while text: + m = HEADER_TOKEN_RE.search(text) + if m: + text = unmatched(m) + name = m.group(1) + m = HEADER_QUOTED_VALUE_RE.search(text) + if m: # quoted value + text = unmatched(m) + value = m.group(1) + value = HEADER_ESCAPE_RE.sub(r"\1", value) + else: + m = HEADER_VALUE_RE.search(text) + if m: # unquoted value + text = unmatched(m) + value = m.group(1) + value = value.rstrip() + else: + # no value, a lone token + value = None + pairs.append((name, value)) + elif text.lstrip().startswith(","): + # concatenated headers, as per RFC 2616 section 4.2 + text = text.lstrip()[1:] + if pairs: result.append(pairs) + pairs = [] + else: + # skip junk + non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", text) + assert nr_junk_chars > 0, ( + "split_header_words bug: '%s', '%s', %s" % + (orig_text, text, pairs)) + text = non_junk + if pairs: result.append(pairs) + return result + +HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])") +def join_header_words(lists): + """Do the inverse (almost) of the conversion done by split_header_words. + + Takes a list of lists of (key, value) pairs and produces a single header + value. Attribute values are quoted if needed. + + >>> join_header_words([[("text/plain", None), ("charset", "iso-8859-1")]]) + 'text/plain; charset="iso-8859-1"' + >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859-1")]]) + 'text/plain, charset="iso-8859-1"' + + """ + headers = [] + for pairs in lists: + attr = [] + for k, v in pairs: + if v is not None: + if not re.search(r"^\w+$", v): + v = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", v) # escape " and \ + v = '"%s"' % v + k = "%s=%s" % (k, v) + attr.append(k) + if attr: headers.append("; ".join(attr)) + return ", ".join(headers) + +def strip_quotes(text): + if text.startswith('"'): + text = text[1:] + if text.endswith('"'): + text = text[:-1] + return text + +def parse_ns_headers(ns_headers): + """Ad-hoc parser for Netscape protocol cookie-attributes. + + The old Netscape cookie format for Set-Cookie can for instance contain + an unquoted "," in the expires field, so we have to use this ad-hoc + parser instead of split_header_words. + + XXX This may not make the best possible effort to parse all the crap + that Netscape Cookie headers contain. Ronald Tschalar's HTTPClient + parser is probably better, so could do worse than following that if + this ever gives any trouble. + + Currently, this is also used for parsing RFC 2109 cookies. + + """ + known_attrs = ("expires", "domain", "path", "secure", + # RFC 2109 attrs (may turn up in Netscape cookies, too) + "version", "port", "max-age") + + result = [] + for ns_header in ns_headers: + pairs = [] + version_set = False + + # XXX: The following does not strictly adhere to RFCs in that empty + # names and values are legal (the former will only appear once and will + # be overwritten if multiple occurrences are present). This is + # mostly to deal with backwards compatibility. + for ii, param in enumerate(ns_header.split(';')): + param = param.strip() + + key, sep, val = param.partition('=') + key = key.strip() + + if not key: + if ii == 0: + break + else: + continue + + # allow for a distinction between present and empty and missing + # altogether + val = val.strip() if sep else None + + if ii != 0: + lc = key.lower() + if lc in known_attrs: + key = lc + + if key == "version": + # This is an RFC 2109 cookie. + if val is not None: + val = strip_quotes(val) + version_set = True + elif key == "expires": + # convert expires date to seconds since epoch + if val is not None: + val = http2time(strip_quotes(val)) # None if invalid + pairs.append((key, val)) + + if pairs: + if not version_set: + pairs.append(("version", "0")) + result.append(pairs) + + return result + + +IPV4_RE = re.compile(r"\.\d+$", re.ASCII) +def is_HDN(text): + """Return True if text is a host domain name.""" + # XXX + # This may well be wrong. Which RFC is HDN defined in, if any (for + # the purposes of RFC 2965)? + # For the current implementation, what about IPv6? Remember to look + # at other uses of IPV4_RE also, if change this. + if IPV4_RE.search(text): + return False + if text == "": + return False + if text[0] == "." or text[-1] == ".": + return False + return True + +def domain_match(A, B): + """Return True if domain A domain-matches domain B, according to RFC 2965. + + A and B may be host domain names or IP addresses. + + RFC 2965, section 1: + + Host names can be specified either as an IP address or a HDN string. + Sometimes we compare one host name with another. (Such comparisons SHALL + be case-insensitive.) Host A's name domain-matches host B's if + + * their host name strings string-compare equal; or + + * A is a HDN string and has the form NB, where N is a non-empty + name string, B has the form .B', and B' is a HDN string. (So, + x.y.com domain-matches .Y.com but not Y.com.) + + Note that domain-match is not a commutative operation: a.b.c.com + domain-matches .c.com, but not the reverse. + + """ + # Note that, if A or B are IP addresses, the only relevant part of the + # definition of the domain-match algorithm is the direct string-compare. + A = A.lower() + B = B.lower() + if A == B: + return True + if not is_HDN(A): + return False + i = A.rfind(B) + if i == -1 or i == 0: + # A does not have form NB, or N is the empty string + return False + if not B.startswith("."): + return False + if not is_HDN(B[1:]): + return False + return True + +def liberal_is_HDN(text): + """Return True if text is a sort-of-like a host domain name. + + For accepting/blocking domains. + + """ + if IPV4_RE.search(text): + return False + return True + +def user_domain_match(A, B): + """For blocking/accepting domains. + + A and B may be host domain names or IP addresses. + + """ + A = A.lower() + B = B.lower() + if not (liberal_is_HDN(A) and liberal_is_HDN(B)): + if A == B: + # equal IP addresses + return True + return False + initial_dot = B.startswith(".") + if initial_dot and A.endswith(B): + return True + if not initial_dot and A == B: + return True + return False + +cut_port_re = re.compile(r":\d+$", re.ASCII) +def request_host(request): + """Return request-host, as defined by RFC 2965. + + Variation from RFC: returned value is lowercased, for convenient + comparison. + + """ + url = request.get_full_url() + host = urllib.parse.urlparse(url)[1] + if host == "": + host = request.get_header("Host", "") + + # remove port, if present + host = cut_port_re.sub("", host, 1) + return host.lower() + +def eff_request_host(request): + """Return a tuple (request-host, effective request-host name). + + As defined by RFC 2965, except both are lowercased. + + """ + erhn = req_host = request_host(request) + if req_host.find(".") == -1 and not IPV4_RE.search(req_host): + erhn = req_host + ".local" + return req_host, erhn + +def request_path(request): + """Path component of request-URI, as defined by RFC 2965.""" + url = request.get_full_url() + parts = urllib.parse.urlsplit(url) + path = escape_path(parts.path) + if not path.startswith("/"): + # fix bad RFC 2396 absoluteURI + path = "/" + path + return path + +def request_port(request): + host = request.host + i = host.find(':') + if i >= 0: + port = host[i+1:] + try: + int(port) + except ValueError: + _debug("nonnumeric port: '%s'", port) + return None + else: + port = DEFAULT_HTTP_PORT + return port + +# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't +# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738). +HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()" +ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])") +def uppercase_escaped_char(match): + return "%%%s" % match.group(1).upper() +def escape_path(path): + """Escape any invalid characters in HTTP URL, and uppercase all escapes.""" + # There's no knowing what character encoding was used to create URLs + # containing %-escapes, but since we have to pick one to escape invalid + # path characters, we pick UTF-8, as recommended in the HTML 4.0 + # specification: + # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1 + # And here, kind of: draft-fielding-uri-rfc2396bis-03 + # (And in draft IRI specification: draft-duerst-iri-05) + # (And here, for new URI schemes: RFC 2718) + path = urllib.parse.quote(path, HTTP_PATH_SAFE) + path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path) + return path + +def reach(h): + """Return reach of host h, as defined by RFC 2965, section 1. + + The reach R of a host name H is defined as follows: + + * If + + - H is the host domain name of a host; and, + + - H has the form A.B; and + + - A has no embedded (that is, interior) dots; and + + - B has at least one embedded dot, or B is the string "local". + then the reach of H is .B. + + * Otherwise, the reach of H is H. + + >>> reach("www.acme.com") + '.acme.com' + >>> reach("acme.com") + 'acme.com' + >>> reach("acme.local") + '.local' + + """ + i = h.find(".") + if i >= 0: + #a = h[:i] # this line is only here to show what a is + b = h[i+1:] + i = b.find(".") + if is_HDN(h) and (i >= 0 or b == "local"): + return "."+b + return h + +def is_third_party(request): + """ + + RFC 2965, section 3.3.6: + + An unverifiable transaction is to a third-party host if its request- + host U does not domain-match the reach R of the request-host O in the + origin transaction. + + """ + req_host = request_host(request) + if not domain_match(req_host, reach(request.origin_req_host)): + return True + else: + return False + + +class Cookie: + """HTTP Cookie. + + This class represents both Netscape and RFC 2965 cookies. + + This is deliberately a very simple class. It just holds attributes. It's + possible to construct Cookie instances that don't comply with the cookie + standards. CookieJar.make_cookies is the factory function for Cookie + objects -- it deals with cookie parsing, supplying defaults, and + normalising to the representation used in this class. CookiePolicy is + responsible for checking them to see whether they should be accepted from + and returned to the server. + + Note that the port may be present in the headers, but unspecified ("Port" + rather than"Port=80", for example); if this is the case, port is None. + + """ + + def __init__(self, version, name, value, + port, port_specified, + domain, domain_specified, domain_initial_dot, + path, path_specified, + secure, + expires, + discard, + comment, + comment_url, + rest, + rfc2109=False, + ): + + if version is not None: version = int(version) + if expires is not None: expires = int(float(expires)) + if port is None and port_specified is True: + raise ValueError("if port is None, port_specified must be false") + + self.version = version + self.name = name + self.value = value + self.port = port + self.port_specified = port_specified + # normalise case, as per RFC 2965 section 3.3.3 + self.domain = domain.lower() + self.domain_specified = domain_specified + # Sigh. We need to know whether the domain given in the + # cookie-attribute had an initial dot, in order to follow RFC 2965 + # (as clarified in draft errata). Needed for the returned $Domain + # value. + self.domain_initial_dot = domain_initial_dot + self.path = path + self.path_specified = path_specified + self.secure = secure + self.expires = expires + self.discard = discard + self.comment = comment + self.comment_url = comment_url + self.rfc2109 = rfc2109 + + self._rest = copy.copy(rest) + + def has_nonstandard_attr(self, name): + return name in self._rest + def get_nonstandard_attr(self, name, default=None): + return self._rest.get(name, default) + def set_nonstandard_attr(self, name, value): + self._rest[name] = value + + def is_expired(self, now=None): + if now is None: now = time.time() + if (self.expires is not None) and (self.expires <= now): + return True + return False + + def __str__(self): + if self.port is None: p = "" + else: p = ":"+self.port + limit = self.domain + p + self.path + if self.value is not None: + namevalue = "%s=%s" % (self.name, self.value) + else: + namevalue = self.name + return "" % (namevalue, limit) + + def __repr__(self): + args = [] + for name in ("version", "name", "value", + "port", "port_specified", + "domain", "domain_specified", "domain_initial_dot", + "path", "path_specified", + "secure", "expires", "discard", "comment", "comment_url", + ): + attr = getattr(self, name) + args.append("%s=%s" % (name, repr(attr))) + args.append("rest=%s" % repr(self._rest)) + args.append("rfc2109=%s" % repr(self.rfc2109)) + return "%s(%s)" % (self.__class__.__name__, ", ".join(args)) + + +class CookiePolicy: + """Defines which cookies get accepted from and returned to server. + + May also modify cookies, though this is probably a bad idea. + + The subclass DefaultCookiePolicy defines the standard rules for Netscape + and RFC 2965 cookies -- override that if you want a customized policy. + + """ + def set_ok(self, cookie, request): + """Return true if (and only if) cookie should be accepted from server. + + Currently, pre-expired cookies never get this far -- the CookieJar + class deletes such cookies itself. + + """ + raise NotImplementedError() + + def return_ok(self, cookie, request): + """Return true if (and only if) cookie should be returned to server.""" + raise NotImplementedError() + + def domain_return_ok(self, domain, request): + """Return false if cookies should not be returned, given cookie domain. + """ + return True + + def path_return_ok(self, path, request): + """Return false if cookies should not be returned, given cookie path. + """ + return True + + +class DefaultCookiePolicy(CookiePolicy): + """Implements the standard rules for accepting and returning cookies.""" + + DomainStrictNoDots = 1 + DomainStrictNonDomain = 2 + DomainRFC2965Match = 4 + + DomainLiberal = 0 + DomainStrict = DomainStrictNoDots|DomainStrictNonDomain + + def __init__(self, + blocked_domains=None, allowed_domains=None, + netscape=True, rfc2965=False, + rfc2109_as_netscape=None, + hide_cookie2=False, + strict_domain=False, + strict_rfc2965_unverifiable=True, + strict_ns_unverifiable=False, + strict_ns_domain=DomainLiberal, + strict_ns_set_initial_dollar=False, + strict_ns_set_path=False, + ): + """Constructor arguments should be passed as keyword arguments only.""" + self.netscape = netscape + self.rfc2965 = rfc2965 + self.rfc2109_as_netscape = rfc2109_as_netscape + self.hide_cookie2 = hide_cookie2 + self.strict_domain = strict_domain + self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable + self.strict_ns_unverifiable = strict_ns_unverifiable + self.strict_ns_domain = strict_ns_domain + self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar + self.strict_ns_set_path = strict_ns_set_path + + if blocked_domains is not None: + self._blocked_domains = tuple(blocked_domains) + else: + self._blocked_domains = () + + if allowed_domains is not None: + allowed_domains = tuple(allowed_domains) + self._allowed_domains = allowed_domains + + def blocked_domains(self): + """Return the sequence of blocked domains (as a tuple).""" + return self._blocked_domains + def set_blocked_domains(self, blocked_domains): + """Set the sequence of blocked domains.""" + self._blocked_domains = tuple(blocked_domains) + + def is_blocked(self, domain): + for blocked_domain in self._blocked_domains: + if user_domain_match(domain, blocked_domain): + return True + return False + + def allowed_domains(self): + """Return None, or the sequence of allowed domains (as a tuple).""" + return self._allowed_domains + def set_allowed_domains(self, allowed_domains): + """Set the sequence of allowed domains, or None.""" + if allowed_domains is not None: + allowed_domains = tuple(allowed_domains) + self._allowed_domains = allowed_domains + + def is_not_allowed(self, domain): + if self._allowed_domains is None: + return False + for allowed_domain in self._allowed_domains: + if user_domain_match(domain, allowed_domain): + return False + return True + + def set_ok(self, cookie, request): + """ + If you override .set_ok(), be sure to call this method. If it returns + false, so should your subclass (assuming your subclass wants to be more + strict about which cookies to accept). + + """ + _debug(" - checking cookie %s=%s", cookie.name, cookie.value) + + assert cookie.name is not None + + for n in "version", "verifiability", "name", "path", "domain", "port": + fn_name = "set_ok_"+n + fn = getattr(self, fn_name) + if not fn(cookie, request): + return False + + return True + + def set_ok_version(self, cookie, request): + if cookie.version is None: + # Version is always set to 0 by parse_ns_headers if it's a Netscape + # cookie, so this must be an invalid RFC 2965 cookie. + _debug(" Set-Cookie2 without version attribute (%s=%s)", + cookie.name, cookie.value) + return False + if cookie.version > 0 and not self.rfc2965: + _debug(" RFC 2965 cookies are switched off") + return False + elif cookie.version == 0 and not self.netscape: + _debug(" Netscape cookies are switched off") + return False + return True + + def set_ok_verifiability(self, cookie, request): + if request.unverifiable and is_third_party(request): + if cookie.version > 0 and self.strict_rfc2965_unverifiable: + _debug(" third-party RFC 2965 cookie during " + "unverifiable transaction") + return False + elif cookie.version == 0 and self.strict_ns_unverifiable: + _debug(" third-party Netscape cookie during " + "unverifiable transaction") + return False + return True + + def set_ok_name(self, cookie, request): + # Try and stop servers setting V0 cookies designed to hack other + # servers that know both V0 and V1 protocols. + if (cookie.version == 0 and self.strict_ns_set_initial_dollar and + cookie.name.startswith("$")): + _debug(" illegal name (starts with '$'): '%s'", cookie.name) + return False + return True + + def set_ok_path(self, cookie, request): + if cookie.path_specified: + req_path = request_path(request) + if ((cookie.version > 0 or + (cookie.version == 0 and self.strict_ns_set_path)) and + not req_path.startswith(cookie.path)): + _debug(" path attribute %s is not a prefix of request " + "path %s", cookie.path, req_path) + return False + return True + + def set_ok_domain(self, cookie, request): + if self.is_blocked(cookie.domain): + _debug(" domain %s is in user block-list", cookie.domain) + return False + if self.is_not_allowed(cookie.domain): + _debug(" domain %s is not in user allow-list", cookie.domain) + return False + if cookie.domain_specified: + req_host, erhn = eff_request_host(request) + domain = cookie.domain + if self.strict_domain and (domain.count(".") >= 2): + # XXX This should probably be compared with the Konqueror + # (kcookiejar.cpp) and Mozilla implementations, but it's a + # losing battle. + i = domain.rfind(".") + j = domain.rfind(".", 0, i) + if j == 0: # domain like .foo.bar + tld = domain[i+1:] + sld = domain[j+1:i] + if sld.lower() in ("co", "ac", "com", "edu", "org", "net", + "gov", "mil", "int", "aero", "biz", "cat", "coop", + "info", "jobs", "mobi", "museum", "name", "pro", + "travel", "eu") and len(tld) == 2: + # domain like .co.uk + _debug(" country-code second level domain %s", domain) + return False + if domain.startswith("."): + undotted_domain = domain[1:] + else: + undotted_domain = domain + embedded_dots = (undotted_domain.find(".") >= 0) + if not embedded_dots and domain != ".local": + _debug(" non-local domain %s contains no embedded dot", + domain) + return False + if cookie.version == 0: + if (not erhn.endswith(domain) and + (not erhn.startswith(".") and + not ("."+erhn).endswith(domain))): + _debug(" effective request-host %s (even with added " + "initial dot) does not end with %s", + erhn, domain) + return False + if (cookie.version > 0 or + (self.strict_ns_domain & self.DomainRFC2965Match)): + if not domain_match(erhn, domain): + _debug(" effective request-host %s does not domain-match " + "%s", erhn, domain) + return False + if (cookie.version > 0 or + (self.strict_ns_domain & self.DomainStrictNoDots)): + host_prefix = req_host[:-len(domain)] + if (host_prefix.find(".") >= 0 and + not IPV4_RE.search(req_host)): + _debug(" host prefix %s for domain %s contains a dot", + host_prefix, domain) + return False + return True + + def set_ok_port(self, cookie, request): + if cookie.port_specified: + req_port = request_port(request) + if req_port is None: + req_port = "80" + else: + req_port = str(req_port) + for p in cookie.port.split(","): + try: + int(p) + except ValueError: + _debug(" bad port %s (not numeric)", p) + return False + if p == req_port: + break + else: + _debug(" request port (%s) not found in %s", + req_port, cookie.port) + return False + return True + + def return_ok(self, cookie, request): + """ + If you override .return_ok(), be sure to call this method. If it + returns false, so should your subclass (assuming your subclass wants to + be more strict about which cookies to return). + + """ + # Path has already been checked by .path_return_ok(), and domain + # blocking done by .domain_return_ok(). + _debug(" - checking cookie %s=%s", cookie.name, cookie.value) + + for n in "version", "verifiability", "secure", "expires", "port", "domain": + fn_name = "return_ok_"+n + fn = getattr(self, fn_name) + if not fn(cookie, request): + return False + return True + + def return_ok_version(self, cookie, request): + if cookie.version > 0 and not self.rfc2965: + _debug(" RFC 2965 cookies are switched off") + return False + elif cookie.version == 0 and not self.netscape: + _debug(" Netscape cookies are switched off") + return False + return True + + def return_ok_verifiability(self, cookie, request): + if request.unverifiable and is_third_party(request): + if cookie.version > 0 and self.strict_rfc2965_unverifiable: + _debug(" third-party RFC 2965 cookie during unverifiable " + "transaction") + return False + elif cookie.version == 0 and self.strict_ns_unverifiable: + _debug(" third-party Netscape cookie during unverifiable " + "transaction") + return False + return True + + def return_ok_secure(self, cookie, request): + if cookie.secure and request.type != "https": + _debug(" secure cookie with non-secure request") + return False + return True + + def return_ok_expires(self, cookie, request): + if cookie.is_expired(self._now): + _debug(" cookie expired") + return False + return True + + def return_ok_port(self, cookie, request): + if cookie.port: + req_port = request_port(request) + if req_port is None: + req_port = "80" + for p in cookie.port.split(","): + if p == req_port: + break + else: + _debug(" request port %s does not match cookie port %s", + req_port, cookie.port) + return False + return True + + def return_ok_domain(self, cookie, request): + req_host, erhn = eff_request_host(request) + domain = cookie.domain + + # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't + if (cookie.version == 0 and + (self.strict_ns_domain & self.DomainStrictNonDomain) and + not cookie.domain_specified and domain != erhn): + _debug(" cookie with unspecified domain does not string-compare " + "equal to request domain") + return False + + if cookie.version > 0 and not domain_match(erhn, domain): + _debug(" effective request-host name %s does not domain-match " + "RFC 2965 cookie domain %s", erhn, domain) + return False + if cookie.version == 0 and not ("."+erhn).endswith(domain): + _debug(" request-host %s does not match Netscape cookie domain " + "%s", req_host, domain) + return False + return True + + def domain_return_ok(self, domain, request): + # Liberal check of. This is here as an optimization to avoid + # having to load lots of MSIE cookie files unless necessary. + req_host, erhn = eff_request_host(request) + if not req_host.startswith("."): + req_host = "."+req_host + if not erhn.startswith("."): + erhn = "."+erhn + if not (req_host.endswith(domain) or erhn.endswith(domain)): + #_debug(" request domain %s does not match cookie domain %s", + # req_host, domain) + return False + + if self.is_blocked(domain): + _debug(" domain %s is in user block-list", domain) + return False + if self.is_not_allowed(domain): + _debug(" domain %s is not in user allow-list", domain) + return False + + return True + + def path_return_ok(self, path, request): + _debug("- checking cookie path=%s", path) + req_path = request_path(request) + if not req_path.startswith(path): + _debug(" %s does not path-match %s", req_path, path) + return False + return True + + +def vals_sorted_by_key(adict): + keys = sorted(adict.keys()) + return map(adict.get, keys) + +def deepvalues(mapping): + """Iterates over nested mapping, depth-first, in sorted order by key.""" + values = vals_sorted_by_key(mapping) + for obj in values: + mapping = False + try: + obj.items + except AttributeError: + pass + else: + mapping = True + yield from deepvalues(obj) + if not mapping: + yield obj + + +# Used as second parameter to dict.get() method, to distinguish absent +# dict key from one with a None value. +class Absent: pass + +class CookieJar: + """Collection of HTTP cookies. + + You may not need to know about this class: try + urllib.request.build_opener(HTTPCookieProcessor).open(url). + """ + + non_word_re = re.compile(r"\W") + quote_re = re.compile(r"([\"\\])") + strict_domain_re = re.compile(r"\.?[^.]*") + domain_re = re.compile(r"[^.]*") + dots_re = re.compile(r"^\.+") + + magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII) + + def __init__(self, policy=None): + if policy is None: + policy = DefaultCookiePolicy() + self._policy = policy + + self._cookies_lock = _threading.RLock() + self._cookies = {} + + def set_policy(self, policy): + self._policy = policy + + def _cookies_for_domain(self, domain, request): + cookies = [] + if not self._policy.domain_return_ok(domain, request): + return [] + _debug("Checking %s for cookies to return", domain) + cookies_by_path = self._cookies[domain] + for path in cookies_by_path.keys(): + if not self._policy.path_return_ok(path, request): + continue + cookies_by_name = cookies_by_path[path] + for cookie in cookies_by_name.values(): + if not self._policy.return_ok(cookie, request): + _debug(" not returning cookie") + continue + _debug(" it's a match") + cookies.append(cookie) + return cookies + + def _cookies_for_request(self, request): + """Return a list of cookies to be returned to server.""" + cookies = [] + for domain in self._cookies.keys(): + cookies.extend(self._cookies_for_domain(domain, request)) + return cookies + + def _cookie_attrs(self, cookies): + """Return a list of cookie-attributes to be returned to server. + + like ['foo="bar"; $Path="/"', ...] + + The $Version attribute is also added when appropriate (currently only + once per request). + + """ + # add cookies in order of most specific (ie. longest) path first + cookies.sort(key=lambda a: len(a.path), reverse=True) + + version_set = False + + attrs = [] + for cookie in cookies: + # set version of Cookie header + # XXX + # What should it be if multiple matching Set-Cookie headers have + # different versions themselves? + # Answer: there is no answer; was supposed to be settled by + # RFC 2965 errata, but that may never appear... + version = cookie.version + if not version_set: + version_set = True + if version > 0: + attrs.append("$Version=%s" % version) + + # quote cookie value if necessary + # (not for Netscape protocol, which already has any quotes + # intact, due to the poorly-specified Netscape Cookie: syntax) + if ((cookie.value is not None) and + self.non_word_re.search(cookie.value) and version > 0): + value = self.quote_re.sub(r"\\\1", cookie.value) + else: + value = cookie.value + + # add cookie-attributes to be returned in Cookie header + if cookie.value is None: + attrs.append(cookie.name) + else: + attrs.append("%s=%s" % (cookie.name, value)) + if version > 0: + if cookie.path_specified: + attrs.append('$Path="%s"' % cookie.path) + if cookie.domain.startswith("."): + domain = cookie.domain + if (not cookie.domain_initial_dot and + domain.startswith(".")): + domain = domain[1:] + attrs.append('$Domain="%s"' % domain) + if cookie.port is not None: + p = "$Port" + if cookie.port_specified: + p = p + ('="%s"' % cookie.port) + attrs.append(p) + + return attrs + + def add_cookie_header(self, request): + """Add correct Cookie: header to request (urllib.request.Request object). + + The Cookie2 header is also added unless policy.hide_cookie2 is true. + + """ + _debug("add_cookie_header") + self._cookies_lock.acquire() + try: + + self._policy._now = self._now = int(time.time()) + + cookies = self._cookies_for_request(request) + + attrs = self._cookie_attrs(cookies) + if attrs: + if not request.has_header("Cookie"): + request.add_unredirected_header( + "Cookie", "; ".join(attrs)) + + # if necessary, advertise that we know RFC 2965 + if (self._policy.rfc2965 and not self._policy.hide_cookie2 and + not request.has_header("Cookie2")): + for cookie in cookies: + if cookie.version != 1: + request.add_unredirected_header("Cookie2", '$Version="1"') + break + + finally: + self._cookies_lock.release() + + self.clear_expired_cookies() + + def _normalized_cookie_tuples(self, attrs_set): + """Return list of tuples containing normalised cookie information. + + attrs_set is the list of lists of key,value pairs extracted from + the Set-Cookie or Set-Cookie2 headers. + + Tuples are name, value, standard, rest, where name and value are the + cookie name and value, standard is a dictionary containing the standard + cookie-attributes (discard, secure, version, expires or max-age, + domain, path and port) and rest is a dictionary containing the rest of + the cookie-attributes. + + """ + cookie_tuples = [] + + boolean_attrs = "discard", "secure" + value_attrs = ("version", + "expires", "max-age", + "domain", "path", "port", + "comment", "commenturl") + + for cookie_attrs in attrs_set: + name, value = cookie_attrs[0] + + # Build dictionary of standard cookie-attributes (standard) and + # dictionary of other cookie-attributes (rest). + + # Note: expiry time is normalised to seconds since epoch. V0 + # cookies should have the Expires cookie-attribute, and V1 cookies + # should have Max-Age, but since V1 includes RFC 2109 cookies (and + # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we + # accept either (but prefer Max-Age). + max_age_set = False + + bad_cookie = False + + standard = {} + rest = {} + for k, v in cookie_attrs[1:]: + lc = k.lower() + # don't lose case distinction for unknown fields + if lc in value_attrs or lc in boolean_attrs: + k = lc + if k in boolean_attrs and v is None: + # boolean cookie-attribute is present, but has no value + # (like "discard", rather than "port=80") + v = True + if k in standard: + # only first value is significant + continue + if k == "domain": + if v is None: + _debug(" missing value for domain attribute") + bad_cookie = True + break + # RFC 2965 section 3.3.3 + v = v.lower() + if k == "expires": + if max_age_set: + # Prefer max-age to expires (like Mozilla) + continue + if v is None: + _debug(" missing or invalid value for expires " + "attribute: treating as session cookie") + continue + if k == "max-age": + max_age_set = True + try: + v = int(v) + except ValueError: + _debug(" missing or invalid (non-numeric) value for " + "max-age attribute") + bad_cookie = True + break + # convert RFC 2965 Max-Age to seconds since epoch + # XXX Strictly you're supposed to follow RFC 2616 + # age-calculation rules. Remember that zero Max-Age + # is a request to discard (old and new) cookie, though. + k = "expires" + v = self._now + v + if (k in value_attrs) or (k in boolean_attrs): + if (v is None and + k not in ("port", "comment", "commenturl")): + _debug(" missing value for %s attribute" % k) + bad_cookie = True + break + standard[k] = v + else: + rest[k] = v + + if bad_cookie: + continue + + cookie_tuples.append((name, value, standard, rest)) + + return cookie_tuples + + def _cookie_from_cookie_tuple(self, tup, request): + # standard is dict of standard cookie-attributes, rest is dict of the + # rest of them + name, value, standard, rest = tup + + domain = standard.get("domain", Absent) + path = standard.get("path", Absent) + port = standard.get("port", Absent) + expires = standard.get("expires", Absent) + + # set the easy defaults + version = standard.get("version", None) + if version is not None: + try: + version = int(version) + except ValueError: + return None # invalid version, ignore cookie + secure = standard.get("secure", False) + # (discard is also set if expires is Absent) + discard = standard.get("discard", False) + comment = standard.get("comment", None) + comment_url = standard.get("commenturl", None) + + # set default path + if path is not Absent and path != "": + path_specified = True + path = escape_path(path) + else: + path_specified = False + path = request_path(request) + i = path.rfind("/") + if i != -1: + if version == 0: + # Netscape spec parts company from reality here + path = path[:i] + else: + path = path[:i+1] + if len(path) == 0: path = "/" + + # set default domain + domain_specified = domain is not Absent + # but first we have to remember whether it starts with a dot + domain_initial_dot = False + if domain_specified: + domain_initial_dot = bool(domain.startswith(".")) + if domain is Absent: + req_host, erhn = eff_request_host(request) + domain = erhn + elif not domain.startswith("."): + domain = "."+domain + + # set default port + port_specified = False + if port is not Absent: + if port is None: + # Port attr present, but has no value: default to request port. + # Cookie should then only be sent back on that port. + port = request_port(request) + else: + port_specified = True + port = re.sub(r"\s+", "", port) + else: + # No port attr present. Cookie can be sent back on any port. + port = None + + # set default expires and discard + if expires is Absent: + expires = None + discard = True + elif expires <= self._now: + # Expiry date in past is request to delete cookie. This can't be + # in DefaultCookiePolicy, because can't delete cookies there. + try: + self.clear(domain, path, name) + except KeyError: + pass + _debug("Expiring cookie, domain='%s', path='%s', name='%s'", + domain, path, name) + return None + + return Cookie(version, + name, value, + port, port_specified, + domain, domain_specified, domain_initial_dot, + path, path_specified, + secure, + expires, + discard, + comment, + comment_url, + rest) + + def _cookies_from_attrs_set(self, attrs_set, request): + cookie_tuples = self._normalized_cookie_tuples(attrs_set) + + cookies = [] + for tup in cookie_tuples: + cookie = self._cookie_from_cookie_tuple(tup, request) + if cookie: cookies.append(cookie) + return cookies + + def _process_rfc2109_cookies(self, cookies): + rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None) + if rfc2109_as_ns is None: + rfc2109_as_ns = not self._policy.rfc2965 + for cookie in cookies: + if cookie.version == 1: + cookie.rfc2109 = True + if rfc2109_as_ns: + # treat 2109 cookies as Netscape cookies rather than + # as RFC2965 cookies + cookie.version = 0 + + def make_cookies(self, response, request): + """Return sequence of Cookie objects extracted from response object.""" + # get cookie-attributes for RFC 2965 and Netscape protocols + headers = response.info() + rfc2965_hdrs = headers.get_all("Set-Cookie2", []) + ns_hdrs = headers.get_all("Set-Cookie", []) + + rfc2965 = self._policy.rfc2965 + netscape = self._policy.netscape + + if ((not rfc2965_hdrs and not ns_hdrs) or + (not ns_hdrs and not rfc2965) or + (not rfc2965_hdrs and not netscape) or + (not netscape and not rfc2965)): + return [] # no relevant cookie headers: quick exit + + try: + cookies = self._cookies_from_attrs_set( + split_header_words(rfc2965_hdrs), request) + except Exception: + _warn_unhandled_exception() + cookies = [] + + if ns_hdrs and netscape: + try: + # RFC 2109 and Netscape cookies + ns_cookies = self._cookies_from_attrs_set( + parse_ns_headers(ns_hdrs), request) + except Exception: + _warn_unhandled_exception() + ns_cookies = [] + self._process_rfc2109_cookies(ns_cookies) + + # Look for Netscape cookies (from Set-Cookie headers) that match + # corresponding RFC 2965 cookies (from Set-Cookie2 headers). + # For each match, keep the RFC 2965 cookie and ignore the Netscape + # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are + # bundled in with the Netscape cookies for this purpose, which is + # reasonable behaviour. + if rfc2965: + lookup = {} + for cookie in cookies: + lookup[(cookie.domain, cookie.path, cookie.name)] = None + + def no_matching_rfc2965(ns_cookie, lookup=lookup): + key = ns_cookie.domain, ns_cookie.path, ns_cookie.name + return key not in lookup + ns_cookies = filter(no_matching_rfc2965, ns_cookies) + + if ns_cookies: + cookies.extend(ns_cookies) + + return cookies + + def set_cookie_if_ok(self, cookie, request): + """Set a cookie if policy says it's OK to do so.""" + self._cookies_lock.acquire() + try: + self._policy._now = self._now = int(time.time()) + + if self._policy.set_ok(cookie, request): + self.set_cookie(cookie) + + + finally: + self._cookies_lock.release() + + def set_cookie(self, cookie): + """Set a cookie, without checking whether or not it should be set.""" + c = self._cookies + self._cookies_lock.acquire() + try: + if cookie.domain not in c: c[cookie.domain] = {} + c2 = c[cookie.domain] + if cookie.path not in c2: c2[cookie.path] = {} + c3 = c2[cookie.path] + c3[cookie.name] = cookie + finally: + self._cookies_lock.release() + + def extract_cookies(self, response, request): + """Extract cookies from response, where allowable given the request.""" + _debug("extract_cookies: %s", response.info()) + self._cookies_lock.acquire() + try: + self._policy._now = self._now = int(time.time()) + + for cookie in self.make_cookies(response, request): + if self._policy.set_ok(cookie, request): + _debug(" setting cookie: %s", cookie) + self.set_cookie(cookie) + finally: + self._cookies_lock.release() + + def clear(self, domain=None, path=None, name=None): + """Clear some cookies. + + Invoking this method without arguments will clear all cookies. If + given a single argument, only cookies belonging to that domain will be + removed. If given two arguments, cookies belonging to the specified + path within that domain are removed. If given three arguments, then + the cookie with the specified name, path and domain is removed. + + Raises KeyError if no matching cookie exists. + + """ + if name is not None: + if (domain is None) or (path is None): + raise ValueError( + "domain and path must be given to remove a cookie by name") + del self._cookies[domain][path][name] + elif path is not None: + if domain is None: + raise ValueError( + "domain must be given to remove cookies by path") + del self._cookies[domain][path] + elif domain is not None: + del self._cookies[domain] + else: + self._cookies = {} + + def clear_session_cookies(self): + """Discard all session cookies. + + Note that the .save() method won't save session cookies anyway, unless + you ask otherwise by passing a true ignore_discard argument. + + """ + self._cookies_lock.acquire() + try: + for cookie in self: + if cookie.discard: + self.clear(cookie.domain, cookie.path, cookie.name) + finally: + self._cookies_lock.release() + + def clear_expired_cookies(self): + """Discard all expired cookies. + + You probably don't need to call this method: expired cookies are never + sent back to the server (provided you're using DefaultCookiePolicy), + this method is called by CookieJar itself every so often, and the + .save() method won't save expired cookies anyway (unless you ask + otherwise by passing a true ignore_expires argument). + + """ + self._cookies_lock.acquire() + try: + now = time.time() + for cookie in self: + if cookie.is_expired(now): + self.clear(cookie.domain, cookie.path, cookie.name) + finally: + self._cookies_lock.release() + + def __iter__(self): + return deepvalues(self._cookies) + + def __len__(self): + """Return number of contained cookies.""" + i = 0 + for cookie in self: i = i + 1 + return i + + def __repr__(self): + r = [] + for cookie in self: r.append(repr(cookie)) + return "<%s[%s]>" % (self.__class__.__name__, ", ".join(r)) + + def __str__(self): + r = [] + for cookie in self: r.append(str(cookie)) + return "<%s[%s]>" % (self.__class__.__name__, ", ".join(r)) + + +# derives from OSError for backwards-compatibility with Python 2.4.0 +class LoadError(OSError): pass + +class FileCookieJar(CookieJar): + """CookieJar that can be loaded from and saved to a file.""" + + def __init__(self, filename=None, delayload=False, policy=None): + """ + Cookies are NOT loaded from the named file until either the .load() or + .revert() method is called. + + """ + CookieJar.__init__(self, policy) + if filename is not None: + try: + filename+"" + except: + raise ValueError("filename must be string-like") + self.filename = filename + self.delayload = bool(delayload) + + def save(self, filename=None, ignore_discard=False, ignore_expires=False): + """Save cookies to a file.""" + raise NotImplementedError() + + def load(self, filename=None, ignore_discard=False, ignore_expires=False): + """Load cookies from a file.""" + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + with open(filename) as f: + self._really_load(f, filename, ignore_discard, ignore_expires) + + def revert(self, filename=None, + ignore_discard=False, ignore_expires=False): + """Clear all cookies and reload cookies from a saved file. + + Raises LoadError (or OSError) if reversion is not successful; the + object's state will not be altered if this happens. + + """ + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + self._cookies_lock.acquire() + try: + + old_state = copy.deepcopy(self._cookies) + self._cookies = {} + try: + self.load(filename, ignore_discard, ignore_expires) + except OSError: + self._cookies = old_state + raise + + finally: + self._cookies_lock.release() + + +def lwp_cookie_str(cookie): + """Return string representation of Cookie in the LWP cookie file format. + + Actually, the format is extended a bit -- see module docstring. + + """ + h = [(cookie.name, cookie.value), + ("path", cookie.path), + ("domain", cookie.domain)] + if cookie.port is not None: h.append(("port", cookie.port)) + if cookie.path_specified: h.append(("path_spec", None)) + if cookie.port_specified: h.append(("port_spec", None)) + if cookie.domain_initial_dot: h.append(("domain_dot", None)) + if cookie.secure: h.append(("secure", None)) + if cookie.expires: h.append(("expires", + time2isoz(float(cookie.expires)))) + if cookie.discard: h.append(("discard", None)) + if cookie.comment: h.append(("comment", cookie.comment)) + if cookie.comment_url: h.append(("commenturl", cookie.comment_url)) + + keys = sorted(cookie._rest.keys()) + for k in keys: + h.append((k, str(cookie._rest[k]))) + + h.append(("version", str(cookie.version))) + + return join_header_words([h]) + +class LWPCookieJar(FileCookieJar): + """ + The LWPCookieJar saves a sequence of "Set-Cookie3" lines. + "Set-Cookie3" is the format used by the libwww-perl library, not known + to be compatible with any browser, but which is easy to read and + doesn't lose information about RFC 2965 cookies. + + Additional methods + + as_lwp_str(ignore_discard=True, ignore_expired=True) + + """ + + def as_lwp_str(self, ignore_discard=True, ignore_expires=True): + """Return cookies as a string of "\\n"-separated "Set-Cookie3" headers. + + ignore_discard and ignore_expires: see docstring for FileCookieJar.save + + """ + now = time.time() + r = [] + for cookie in self: + if not ignore_discard and cookie.discard: + continue + if not ignore_expires and cookie.is_expired(now): + continue + r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie)) + return "\n".join(r+[""]) + + def save(self, filename=None, ignore_discard=False, ignore_expires=False): + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + with open(filename, "w") as f: + # There really isn't an LWP Cookies 2.0 format, but this indicates + # that there is extra information in here (domain_dot and + # port_spec) while still being compatible with libwww-perl, I hope. + f.write("#LWP-Cookies-2.0\n") + f.write(self.as_lwp_str(ignore_discard, ignore_expires)) + + def _really_load(self, f, filename, ignore_discard, ignore_expires): + magic = f.readline() + if not self.magic_re.search(magic): + msg = ("%r does not look like a Set-Cookie3 (LWP) format " + "file" % filename) + raise LoadError(msg) + + now = time.time() + + header = "Set-Cookie3:" + boolean_attrs = ("port_spec", "path_spec", "domain_dot", + "secure", "discard") + value_attrs = ("version", + "port", "path", "domain", + "expires", + "comment", "commenturl") + + try: + while 1: + line = f.readline() + if line == "": break + if not line.startswith(header): + continue + line = line[len(header):].strip() + + for data in split_header_words([line]): + name, value = data[0] + standard = {} + rest = {} + for k in boolean_attrs: + standard[k] = False + for k, v in data[1:]: + if k is not None: + lc = k.lower() + else: + lc = None + # don't lose case distinction for unknown fields + if (lc in value_attrs) or (lc in boolean_attrs): + k = lc + if k in boolean_attrs: + if v is None: v = True + standard[k] = v + elif k in value_attrs: + standard[k] = v + else: + rest[k] = v + + h = standard.get + expires = h("expires") + discard = h("discard") + if expires is not None: + expires = iso2time(expires) + if expires is None: + discard = True + domain = h("domain") + domain_specified = domain.startswith(".") + c = Cookie(h("version"), name, value, + h("port"), h("port_spec"), + domain, domain_specified, h("domain_dot"), + h("path"), h("path_spec"), + h("secure"), + expires, + discard, + h("comment"), + h("commenturl"), + rest) + if not ignore_discard and c.discard: + continue + if not ignore_expires and c.is_expired(now): + continue + self.set_cookie(c) + except OSError: + raise + except Exception: + _warn_unhandled_exception() + raise LoadError("invalid Set-Cookie3 format file %r: %r" % + (filename, line)) + + +class MozillaCookieJar(FileCookieJar): + """ + + WARNING: you may want to backup your browser's cookies file if you use + this class to save cookies. I *think* it works, but there have been + bugs in the past! + + This class differs from CookieJar only in the format it uses to save and + load cookies to and from a file. This class uses the Mozilla/Netscape + `cookies.txt' format. lynx uses this file format, too. + + Don't expect cookies saved while the browser is running to be noticed by + the browser (in fact, Mozilla on unix will overwrite your saved cookies if + you change them on disk while it's running; on Windows, you probably can't + save at all while the browser is running). + + Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to + Netscape cookies on saving. + + In particular, the cookie version and port number information is lost, + together with information about whether or not Path, Port and Discard were + specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the + domain as set in the HTTP header started with a dot (yes, I'm aware some + domains in Netscape files start with a dot and some don't -- trust me, you + really don't want to know any more about this). + + Note that though Mozilla and Netscape use the same format, they use + slightly different headers. The class saves cookies using the Netscape + header by default (Mozilla can cope with that). + + """ + magic_re = re.compile("#( Netscape)? HTTP Cookie File") + header = """\ +# Netscape HTTP Cookie File +# http://curl.haxx.se/rfc/cookie_spec.html +# This is a generated file! Do not edit. + +""" + + def _really_load(self, f, filename, ignore_discard, ignore_expires): + now = time.time() + + magic = f.readline() + if not self.magic_re.search(magic): + raise LoadError( + "%r does not look like a Netscape format cookies file" % + filename) + + try: + while 1: + line = f.readline() + if line == "": break + + # last field may be absent, so keep any trailing tab + if line.endswith("\n"): line = line[:-1] + + # skip comments and blank lines XXX what is $ for? + if (line.strip().startswith(("#", "$")) or + line.strip() == ""): + continue + + domain, domain_specified, path, secure, expires, name, value = \ + line.split("\t") + secure = (secure == "TRUE") + domain_specified = (domain_specified == "TRUE") + if name == "": + # cookies.txt regards 'Set-Cookie: foo' as a cookie + # with no name, whereas http.cookiejar regards it as a + # cookie with no value. + name = value + value = None + + initial_dot = domain.startswith(".") + assert domain_specified == initial_dot + + discard = False + if expires == "": + expires = None + discard = True + + # assume path_specified is false + c = Cookie(0, name, value, + None, False, + domain, domain_specified, initial_dot, + path, False, + secure, + expires, + discard, + None, + None, + {}) + if not ignore_discard and c.discard: + continue + if not ignore_expires and c.is_expired(now): + continue + self.set_cookie(c) + + except OSError: + raise + except Exception: + _warn_unhandled_exception() + raise LoadError("invalid Netscape format cookies file %r: %r" % + (filename, line)) + + def save(self, filename=None, ignore_discard=False, ignore_expires=False): + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + with open(filename, "w") as f: + f.write(self.header) + now = time.time() + for cookie in self: + if not ignore_discard and cookie.discard: + continue + if not ignore_expires and cookie.is_expired(now): + continue + if cookie.secure: secure = "TRUE" + else: secure = "FALSE" + if cookie.domain.startswith("."): initial_dot = "TRUE" + else: initial_dot = "FALSE" + if cookie.expires is not None: + expires = str(cookie.expires) + else: + expires = "" + if cookie.value is None: + # cookies.txt regards 'Set-Cookie: foo' as a cookie + # with no name, whereas http.cookiejar regards it as a + # cookie with no value. + name = "" + value = cookie.name + else: + name = cookie.name + value = cookie.value + f.write( + "\t".join([cookie.domain, initial_dot, cookie.path, + secure, expires, name, value])+ + "\n") diff --git a/Lib/http/cookies.py b/Lib/http/cookies.py new file mode 100644 index 0000000000..be3b080aa3 --- /dev/null +++ b/Lib/http/cookies.py @@ -0,0 +1,635 @@ +#### +# Copyright 2000 by Timothy O'Malley +# +# All Rights Reserved +# +# Permission to use, copy, modify, and distribute this software +# and its documentation for any purpose and without fee is hereby +# granted, provided that the above copyright notice appear in all +# copies and that both that copyright notice and this permission +# notice appear in supporting documentation, and that the name of +# Timothy O'Malley not be used in advertising or publicity +# pertaining to distribution of the software without specific, written +# prior permission. +# +# Timothy O'Malley DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS +# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY +# AND FITNESS, IN NO EVENT SHALL Timothy O'Malley BE LIABLE FOR +# ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +# PERFORMANCE OF THIS SOFTWARE. +# +#### +# +# Id: Cookie.py,v 2.29 2000/08/23 05:28:49 timo Exp +# by Timothy O'Malley +# +# Cookie.py is a Python module for the handling of HTTP +# cookies as a Python dictionary. See RFC 2109 for more +# information on cookies. +# +# The original idea to treat Cookies as a dictionary came from +# Dave Mitchell (davem@magnet.com) in 1995, when he released the +# first version of nscookie.py. +# +#### + +r""" +Here's a sample session to show how to use this module. +At the moment, this is the only documentation. + +The Basics +---------- + +Importing is easy... + + >>> from http import cookies + +Most of the time you start by creating a cookie. + + >>> C = cookies.SimpleCookie() + +Once you've created your Cookie, you can add values just as if it were +a dictionary. + + >>> C = cookies.SimpleCookie() + >>> C["fig"] = "newton" + >>> C["sugar"] = "wafer" + >>> C.output() + 'Set-Cookie: fig=newton\r\nSet-Cookie: sugar=wafer' + +Notice that the printable representation of a Cookie is the +appropriate format for a Set-Cookie: header. This is the +default behavior. You can change the header and printed +attributes by using the .output() function + + >>> C = cookies.SimpleCookie() + >>> C["rocky"] = "road" + >>> C["rocky"]["path"] = "/cookie" + >>> print(C.output(header="Cookie:")) + Cookie: rocky=road; Path=/cookie + >>> print(C.output(attrs=[], header="Cookie:")) + Cookie: rocky=road + +The load() method of a Cookie extracts cookies from a string. In a +CGI script, you would use this method to extract the cookies from the +HTTP_COOKIE environment variable. + + >>> C = cookies.SimpleCookie() + >>> C.load("chips=ahoy; vienna=finger") + >>> C.output() + 'Set-Cookie: chips=ahoy\r\nSet-Cookie: vienna=finger' + +The load() method is darn-tootin smart about identifying cookies +within a string. Escaped quotation marks, nested semicolons, and other +such trickeries do not confuse it. + + >>> C = cookies.SimpleCookie() + >>> C.load('keebler="E=everybody; L=\\"Loves\\"; fudge=\\012;";') + >>> print(C) + Set-Cookie: keebler="E=everybody; L=\"Loves\"; fudge=\012;" + +Each element of the Cookie also supports all of the RFC 2109 +Cookie attributes. Here's an example which sets the Path +attribute. + + >>> C = cookies.SimpleCookie() + >>> C["oreo"] = "doublestuff" + >>> C["oreo"]["path"] = "/" + >>> print(C) + Set-Cookie: oreo=doublestuff; Path=/ + +Each dictionary element has a 'value' attribute, which gives you +back the value associated with the key. + + >>> C = cookies.SimpleCookie() + >>> C["twix"] = "none for you" + >>> C["twix"].value + 'none for you' + +The SimpleCookie expects that all values should be standard strings. +Just to be sure, SimpleCookie invokes the str() builtin to convert +the value to a string, when the values are set dictionary-style. + + >>> C = cookies.SimpleCookie() + >>> C["number"] = 7 + >>> C["string"] = "seven" + >>> C["number"].value + '7' + >>> C["string"].value + 'seven' + >>> C.output() + 'Set-Cookie: number=7\r\nSet-Cookie: string=seven' + +Finis. +""" + +# +# Import our required modules +# +import re +import string + +__all__ = ["CookieError", "BaseCookie", "SimpleCookie"] + +_nulljoin = ''.join +_semispacejoin = '; '.join +_spacejoin = ' '.join + +def _warn_deprecated_setter(setter): + import warnings + msg = ('The .%s setter is deprecated. The attribute will be read-only in ' + 'future releases. Please use the set() method instead.' % setter) + warnings.warn(msg, DeprecationWarning, stacklevel=3) + +# +# Define an exception visible to External modules +# +class CookieError(Exception): + pass + + +# These quoting routines conform to the RFC2109 specification, which in +# turn references the character definitions from RFC2068. They provide +# a two-way quoting algorithm. Any non-text character is translated +# into a 4 character sequence: a forward-slash followed by the +# three-digit octal equivalent of the character. Any '\' or '"' is +# quoted with a preceding '\' slash. +# Because of the way browsers really handle cookies (as opposed to what +# the RFC says) we also encode "," and ";". +# +# These are taken from RFC2068 and RFC2109. +# _LegalChars is the list of chars which don't require "'s +# _Translator hash-table for fast quoting +# +_LegalChars = string.ascii_letters + string.digits + "!#$%&'*+-.^_`|~:" +_UnescapedChars = _LegalChars + ' ()/<=>?@[]{}' + +_Translator = {n: '\\%03o' % n + for n in set(range(256)) - set(map(ord, _UnescapedChars))} +_Translator.update({ + ord('"'): '\\"', + ord('\\'): '\\\\', +}) + +_is_legal_key = re.compile('[%s]+' % re.escape(_LegalChars)).fullmatch + +def _quote(str): + r"""Quote a string for use in a cookie header. + + If the string does not need to be double-quoted, then just return the + string. Otherwise, surround the string in doublequotes and quote + (with a \) special characters. + """ + if str is None or _is_legal_key(str): + return str + else: + return '"' + str.translate(_Translator) + '"' + + +_OctalPatt = re.compile(r"\\[0-3][0-7][0-7]") +_QuotePatt = re.compile(r"[\\].") + +def _unquote(str): + # If there aren't any doublequotes, + # then there can't be any special characters. See RFC 2109. + if str is None or len(str) < 2: + return str + if str[0] != '"' or str[-1] != '"': + return str + + # We have to assume that we must decode this string. + # Down to work. + + # Remove the "s + str = str[1:-1] + + # Check for special sequences. Examples: + # \012 --> \n + # \" --> " + # + i = 0 + n = len(str) + res = [] + while 0 <= i < n: + o_match = _OctalPatt.search(str, i) + q_match = _QuotePatt.search(str, i) + if not o_match and not q_match: # Neither matched + res.append(str[i:]) + break + # else: + j = k = -1 + if o_match: + j = o_match.start(0) + if q_match: + k = q_match.start(0) + if q_match and (not o_match or k < j): # QuotePatt matched + res.append(str[i:k]) + res.append(str[k+1]) + i = k + 2 + else: # OctalPatt matched + res.append(str[i:j]) + res.append(chr(int(str[j+1:j+4], 8))) + i = j + 4 + return _nulljoin(res) + +# The _getdate() routine is used to set the expiration time in the cookie's HTTP +# header. By default, _getdate() returns the current time in the appropriate +# "expires" format for a Set-Cookie header. The one optional argument is an +# offset from now, in seconds. For example, an offset of -3600 means "one hour +# ago". The offset may be a floating point number. +# + +_weekdayname = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] + +_monthname = [None, + 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', + 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] + +def _getdate(future=0, weekdayname=_weekdayname, monthname=_monthname): + from time import gmtime, time + now = time() + year, month, day, hh, mm, ss, wd, y, z = gmtime(now + future) + return "%s, %02d %3s %4d %02d:%02d:%02d GMT" % \ + (weekdayname[wd], day, monthname[month], year, hh, mm, ss) + + +class Morsel(dict): + """A class to hold ONE (key, value) pair. + + In a cookie, each such pair may have several attributes, so this class is + used to keep the attributes associated with the appropriate key,value pair. + This class also includes a coded_value attribute, which is used to hold + the network representation of the value. This is most useful when Python + objects are pickled for network transit. + """ + # RFC 2109 lists these attributes as reserved: + # path comment domain + # max-age secure version + # + # For historical reasons, these attributes are also reserved: + # expires + # + # This is an extension from Microsoft: + # httponly + # + # This dictionary provides a mapping from the lowercase + # variant on the left to the appropriate traditional + # formatting on the right. + _reserved = { + "expires" : "expires", + "path" : "Path", + "comment" : "Comment", + "domain" : "Domain", + "max-age" : "Max-Age", + "secure" : "Secure", + "httponly" : "HttpOnly", + "version" : "Version", + } + + _flags = {'secure', 'httponly'} + + def __init__(self): + # Set defaults + self._key = self._value = self._coded_value = None + + # Set default attributes + for key in self._reserved: + dict.__setitem__(self, key, "") + + @property + def key(self): + return self._key + + @key.setter + def key(self, key): + _warn_deprecated_setter('key') + self._key = key + + @property + def value(self): + return self._value + + @value.setter + def value(self, value): + _warn_deprecated_setter('value') + self._value = value + + @property + def coded_value(self): + return self._coded_value + + @coded_value.setter + def coded_value(self, coded_value): + _warn_deprecated_setter('coded_value') + self._coded_value = coded_value + + def __setitem__(self, K, V): + K = K.lower() + if not K in self._reserved: + raise CookieError("Invalid attribute %r" % (K,)) + dict.__setitem__(self, K, V) + + def setdefault(self, key, val=None): + key = key.lower() + if key not in self._reserved: + raise CookieError("Invalid attribute %r" % (key,)) + return dict.setdefault(self, key, val) + + def __eq__(self, morsel): + if not isinstance(morsel, Morsel): + return NotImplemented + return (dict.__eq__(self, morsel) and + self._value == morsel._value and + self._key == morsel._key and + self._coded_value == morsel._coded_value) + + __ne__ = object.__ne__ + + def copy(self): + morsel = Morsel() + dict.update(morsel, self) + morsel.__dict__.update(self.__dict__) + return morsel + + def update(self, values): + data = {} + for key, val in dict(values).items(): + key = key.lower() + if key not in self._reserved: + raise CookieError("Invalid attribute %r" % (key,)) + data[key] = val + dict.update(self, data) + + def isReservedKey(self, K): + return K.lower() in self._reserved + + def set(self, key, val, coded_val, LegalChars=_LegalChars): + if LegalChars != _LegalChars: + import warnings + warnings.warn( + 'LegalChars parameter is deprecated, ignored and will ' + 'be removed in future versions.', DeprecationWarning, + stacklevel=2) + + if key.lower() in self._reserved: + raise CookieError('Attempt to set a reserved key %r' % (key,)) + if not _is_legal_key(key): + raise CookieError('Illegal key %r' % (key,)) + + # It's a good key, so save it. + self._key = key + self._value = val + self._coded_value = coded_val + + def __getstate__(self): + return { + 'key': self._key, + 'value': self._value, + 'coded_value': self._coded_value, + } + + def __setstate__(self, state): + self._key = state['key'] + self._value = state['value'] + self._coded_value = state['coded_value'] + + def output(self, attrs=None, header="Set-Cookie:"): + return "%s %s" % (header, self.OutputString(attrs)) + + __str__ = output + + def __repr__(self): + return '<%s: %s>' % (self.__class__.__name__, self.OutputString()) + + def js_output(self, attrs=None): + # Print javascript + return """ + + """ % (self.OutputString(attrs).replace('"', r'\"')) + + def OutputString(self, attrs=None): + # Build up our result + # + result = [] + append = result.append + + # First, the key=value pair + append("%s=%s" % (self.key, self.coded_value)) + + # Now add any defined attributes + if attrs is None: + attrs = self._reserved + items = sorted(self.items()) + for key, value in items: + if value == "": + continue + if key not in attrs: + continue + if key == "expires" and isinstance(value, int): + append("%s=%s" % (self._reserved[key], _getdate(value))) + elif key == "max-age" and isinstance(value, int): + append("%s=%d" % (self._reserved[key], value)) + elif key in self._flags: + if value: + append(str(self._reserved[key])) + else: + append("%s=%s" % (self._reserved[key], value)) + + # Return the result + return _semispacejoin(result) + + +# +# Pattern for finding cookie +# +# This used to be strict parsing based on the RFC2109 and RFC2068 +# specifications. I have since discovered that MSIE 3.0x doesn't +# follow the character rules outlined in those specs. As a +# result, the parsing rules here are less strict. +# + +_LegalKeyChars = r"\w\d!#%&'~_`><@,:/\$\*\+\-\.\^\|\)\(\?\}\{\=" +_LegalValueChars = _LegalKeyChars + r'\[\]' +_CookiePattern = re.compile(r""" + \s* # Optional whitespace at start of cookie + (?P # Start of group 'key' + [""" + _LegalKeyChars + r"""]+? # Any word of at least one letter + ) # End of group 'key' + ( # Optional group: there may not be a value. + \s*=\s* # Equal Sign + (?P # Start of group 'val' + "(?:[^\\"]|\\.)*" # Any doublequoted string + | # or + \w{3},\s[\w\d\s-]{9,11}\s[\d:]{8}\sGMT # Special case for "expires" attr + | # or + [""" + _LegalValueChars + r"""]* # Any word or empty string + ) # End of group 'val' + )? # End of optional value group + \s* # Any number of spaces. + (\s+|;|$) # Ending either at space, semicolon, or EOS. + """, re.ASCII | re.VERBOSE) # re.ASCII may be removed if safe. + + +# At long last, here is the cookie class. Using this class is almost just like +# using a dictionary. See this module's docstring for example usage. +# +class BaseCookie(dict): + """A container class for a set of Morsels.""" + + def value_decode(self, val): + """real_value, coded_value = value_decode(STRING) + Called prior to setting a cookie's value from the network + representation. The VALUE is the value read from HTTP + header. + Override this function to modify the behavior of cookies. + """ + return val, val + + def value_encode(self, val): + """real_value, coded_value = value_encode(VALUE) + Called prior to setting a cookie's value from the dictionary + representation. The VALUE is the value being assigned. + Override this function to modify the behavior of cookies. + """ + strval = str(val) + return strval, strval + + def __init__(self, input=None): + if input: + self.load(input) + + def __set(self, key, real_value, coded_value): + """Private method for setting a cookie's value""" + M = self.get(key, Morsel()) + M.set(key, real_value, coded_value) + dict.__setitem__(self, key, M) + + def __setitem__(self, key, value): + """Dictionary style assignment.""" + if isinstance(value, Morsel): + # allow assignment of constructed Morsels (e.g. for pickling) + dict.__setitem__(self, key, value) + else: + rval, cval = self.value_encode(value) + self.__set(key, rval, cval) + + def output(self, attrs=None, header="Set-Cookie:", sep="\015\012"): + """Return a string suitable for HTTP.""" + result = [] + items = sorted(self.items()) + for key, value in items: + result.append(value.output(attrs, header)) + return sep.join(result) + + __str__ = output + + def __repr__(self): + l = [] + items = sorted(self.items()) + for key, value in items: + l.append('%s=%s' % (key, repr(value.value))) + return '<%s: %s>' % (self.__class__.__name__, _spacejoin(l)) + + def js_output(self, attrs=None): + """Return a string suitable for JavaScript.""" + result = [] + items = sorted(self.items()) + for key, value in items: + result.append(value.js_output(attrs)) + return _nulljoin(result) + + def load(self, rawdata): + """Load cookies from a string (presumably HTTP_COOKIE) or + from a dictionary. Loading cookies from a dictionary 'd' + is equivalent to calling: + map(Cookie.__setitem__, d.keys(), d.values()) + """ + if isinstance(rawdata, str): + self.__parse_string(rawdata) + else: + # self.update() wouldn't call our custom __setitem__ + for key, value in rawdata.items(): + self[key] = value + return + + def __parse_string(self, str, patt=_CookiePattern): + i = 0 # Our starting point + n = len(str) # Length of string + parsed_items = [] # Parsed (type, key, value) triples + morsel_seen = False # A key=value pair was previously encountered + + TYPE_ATTRIBUTE = 1 + TYPE_KEYVALUE = 2 + + # We first parse the whole cookie string and reject it if it's + # syntactically invalid (this helps avoid some classes of injection + # attacks). + while 0 <= i < n: + # Start looking for a cookie + match = patt.match(str, i) + if not match: + # No more cookies + break + + key, value = match.group("key"), match.group("val") + i = match.end(0) + + if key[0] == "$": + if not morsel_seen: + # We ignore attributes which pertain to the cookie + # mechanism as a whole, such as "$Version". + # See RFC 2965. (Does anyone care?) + continue + parsed_items.append((TYPE_ATTRIBUTE, key[1:], value)) + elif key.lower() in Morsel._reserved: + if not morsel_seen: + # Invalid cookie string + return + if value is None: + if key.lower() in Morsel._flags: + parsed_items.append((TYPE_ATTRIBUTE, key, True)) + else: + # Invalid cookie string + return + else: + parsed_items.append((TYPE_ATTRIBUTE, key, _unquote(value))) + elif value is not None: + parsed_items.append((TYPE_KEYVALUE, key, self.value_decode(value))) + morsel_seen = True + else: + # Invalid cookie string + return + + # The cookie string is valid, apply it. + M = None # current morsel + for tp, key, value in parsed_items: + if tp == TYPE_ATTRIBUTE: + assert M is not None + M[key] = value + else: + assert tp == TYPE_KEYVALUE + rval, cval = value + self.__set(key, rval, cval) + M = self[key] + + +class SimpleCookie(BaseCookie): + """ + SimpleCookie supports strings as cookie values. When setting + the value using the dictionary assignment notation, SimpleCookie + calls the builtin str() to convert the value to a string. Values + received from HTTP are kept as strings. + """ + def value_decode(self, val): + return _unquote(val), val + + def value_encode(self, val): + strval = str(val) + return strval, _quote(strval) diff --git a/Lib/http/server.py b/Lib/http/server.py new file mode 100644 index 0000000000..e12e45bfc3 --- /dev/null +++ b/Lib/http/server.py @@ -0,0 +1,1211 @@ +"""HTTP server classes. + +Note: BaseHTTPRequestHandler doesn't implement any HTTP request; see +SimpleHTTPRequestHandler for simple implementations of GET, HEAD and POST, +and CGIHTTPRequestHandler for CGI scripts. + +It does, however, optionally implement HTTP/1.1 persistent connections, +as of version 0.3. + +Notes on CGIHTTPRequestHandler +------------------------------ + +This class implements GET and POST requests to cgi-bin scripts. + +If the os.fork() function is not present (e.g. on Windows), +subprocess.Popen() is used as a fallback, with slightly altered semantics. + +In all cases, the implementation is intentionally naive -- all +requests are executed synchronously. + +SECURITY WARNING: DON'T USE THIS CODE UNLESS YOU ARE INSIDE A FIREWALL +-- it may execute arbitrary Python code or external programs. + +Note that status code 200 is sent prior to execution of a CGI script, so +scripts cannot send other status codes such as 302 (redirect). + +XXX To do: + +- log requests even later (to capture byte count) +- log user-agent header and other interesting goodies +- send error log to separate file +""" + + +# See also: +# +# HTTP Working Group T. Berners-Lee +# INTERNET-DRAFT R. T. Fielding +# H. Frystyk Nielsen +# Expires September 8, 1995 March 8, 1995 +# +# URL: http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt +# +# and +# +# Network Working Group R. Fielding +# Request for Comments: 2616 et al +# Obsoletes: 2068 June 1999 +# Category: Standards Track +# +# URL: http://www.faqs.org/rfcs/rfc2616.html + +# Log files +# --------- +# +# Here's a quote from the NCSA httpd docs about log file format. +# +# | The logfile format is as follows. Each line consists of: +# | +# | host rfc931 authuser [DD/Mon/YYYY:hh:mm:ss] "request" ddd bbbb +# | +# | host: Either the DNS name or the IP number of the remote client +# | rfc931: Any information returned by identd for this person, +# | - otherwise. +# | authuser: If user sent a userid for authentication, the user name, +# | - otherwise. +# | DD: Day +# | Mon: Month (calendar name) +# | YYYY: Year +# | hh: hour (24-hour format, the machine's timezone) +# | mm: minutes +# | ss: seconds +# | request: The first line of the HTTP request as sent by the client. +# | ddd: the status code returned by the server, - if not available. +# | bbbb: the total number of bytes sent, +# | *not including the HTTP/1.0 header*, - if not available +# | +# | You can determine the name of the file accessed through request. +# +# (Actually, the latter is only true if you know the server configuration +# at the time the request was made!) + +__version__ = "0.6" + +__all__ = [ + "HTTPServer", "BaseHTTPRequestHandler", + "SimpleHTTPRequestHandler", "CGIHTTPRequestHandler", +] + +import email.utils +import html +import http.client +import io +import mimetypes +import os +import posixpath +import select +import shutil +import socket # For gethostbyaddr() +import socketserver +import sys +import time +import urllib.parse +import copy +import argparse + +from http import HTTPStatus + + +# Default error message template +DEFAULT_ERROR_MESSAGE = """\ + + + + + Error response + + +

Error response

+

Error code: %(code)d

+

Message: %(message)s.

+

Error code explanation: %(code)s - %(explain)s.

+ + +""" + +DEFAULT_ERROR_CONTENT_TYPE = "text/html;charset=utf-8" + +class HTTPServer(socketserver.TCPServer): + + allow_reuse_address = 1 # Seems to make sense in testing environment + + def server_bind(self): + """Override server_bind to store the server name.""" + socketserver.TCPServer.server_bind(self) + host, port = self.server_address[:2] + self.server_name = socket.getfqdn(host) + self.server_port = port + + +class BaseHTTPRequestHandler(socketserver.StreamRequestHandler): + + """HTTP request handler base class. + + The following explanation of HTTP serves to guide you through the + code as well as to expose any misunderstandings I may have about + HTTP (so you don't need to read the code to figure out I'm wrong + :-). + + HTTP (HyperText Transfer Protocol) is an extensible protocol on + top of a reliable stream transport (e.g. TCP/IP). The protocol + recognizes three parts to a request: + + 1. One line identifying the request type and path + 2. An optional set of RFC-822-style headers + 3. An optional data part + + The headers and data are separated by a blank line. + + The first line of the request has the form + + + + where is a (case-sensitive) keyword such as GET or POST, + is a string containing path information for the request, + and should be the string "HTTP/1.0" or "HTTP/1.1". + is encoded using the URL encoding scheme (using %xx to signify + the ASCII character with hex code xx). + + The specification specifies that lines are separated by CRLF but + for compatibility with the widest range of clients recommends + servers also handle LF. Similarly, whitespace in the request line + is treated sensibly (allowing multiple spaces between components + and allowing trailing whitespace). + + Similarly, for output, lines ought to be separated by CRLF pairs + but most clients grok LF characters just fine. + + If the first line of the request has the form + + + + (i.e. is left out) then this is assumed to be an HTTP + 0.9 request; this form has no optional headers and data part and + the reply consists of just the data. + + The reply form of the HTTP 1.x protocol again has three parts: + + 1. One line giving the response code + 2. An optional set of RFC-822-style headers + 3. The data + + Again, the headers and data are separated by a blank line. + + The response code line has the form + + + + where is the protocol version ("HTTP/1.0" or "HTTP/1.1"), + is a 3-digit response code indicating success or + failure of the request, and is an optional + human-readable string explaining what the response code means. + + This server parses the request and the headers, and then calls a + function specific to the request type (). Specifically, + a request SPAM will be handled by a method do_SPAM(). If no + such method exists the server sends an error response to the + client. If it exists, it is called with no arguments: + + do_SPAM() + + Note that the request name is case sensitive (i.e. SPAM and spam + are different requests). + + The various request details are stored in instance variables: + + - client_address is the client IP address in the form (host, + port); + + - command, path and version are the broken-down request line; + + - headers is an instance of email.message.Message (or a derived + class) containing the header information; + + - rfile is a file object open for reading positioned at the + start of the optional input data part; + + - wfile is a file object open for writing. + + IT IS IMPORTANT TO ADHERE TO THE PROTOCOL FOR WRITING! + + The first thing to be written must be the response line. Then + follow 0 or more header lines, then a blank line, and then the + actual data (if any). The meaning of the header lines depends on + the command executed by the server; in most cases, when data is + returned, there should be at least one header line of the form + + Content-type: / + + where and should be registered MIME types, + e.g. "text/html" or "text/plain". + + """ + + # The Python system version, truncated to its first component. + sys_version = "Python/" + sys.version.split()[0] + + # The server software version. You may want to override this. + # The format is multiple whitespace-separated strings, + # where each string is of the form name[/version]. + server_version = "BaseHTTP/" + __version__ + + error_message_format = DEFAULT_ERROR_MESSAGE + error_content_type = DEFAULT_ERROR_CONTENT_TYPE + + # The default request version. This only affects responses up until + # the point where the request line is parsed, so it mainly decides what + # the client gets back when sending a malformed request line. + # Most web servers default to HTTP 0.9, i.e. don't send a status line. + default_request_version = "HTTP/0.9" + + def parse_request(self): + """Parse a request (internal). + + The request should be stored in self.raw_requestline; the results + are in self.command, self.path, self.request_version and + self.headers. + + Return True for success, False for failure; on failure, an + error is sent back. + + """ + self.command = None # set in case of error on the first line + self.request_version = version = self.default_request_version + self.close_connection = True + requestline = str(self.raw_requestline, 'iso-8859-1') + requestline = requestline.rstrip('\r\n') + self.requestline = requestline + words = requestline.split() + if len(words) == 3: + command, path, version = words + try: + if version[:5] != 'HTTP/': + raise ValueError + base_version_number = version.split('/', 1)[1] + version_number = base_version_number.split(".") + # RFC 2145 section 3.1 says there can be only one "." and + # - major and minor numbers MUST be treated as + # separate integers; + # - HTTP/2.4 is a lower version than HTTP/2.13, which in + # turn is lower than HTTP/12.3; + # - Leading zeros MUST be ignored by recipients. + if len(version_number) != 2: + raise ValueError + version_number = int(version_number[0]), int(version_number[1]) + except (ValueError, IndexError): + self.send_error( + HTTPStatus.BAD_REQUEST, + "Bad request version (%r)" % version) + return False + if version_number >= (1, 1) and self.protocol_version >= "HTTP/1.1": + self.close_connection = False + if version_number >= (2, 0): + self.send_error( + HTTPStatus.HTTP_VERSION_NOT_SUPPORTED, + "Invalid HTTP version (%s)" % base_version_number) + return False + elif len(words) == 2: + command, path = words + self.close_connection = True + if command != 'GET': + self.send_error( + HTTPStatus.BAD_REQUEST, + "Bad HTTP/0.9 request type (%r)" % command) + return False + elif not words: + return False + else: + self.send_error( + HTTPStatus.BAD_REQUEST, + "Bad request syntax (%r)" % requestline) + return False + self.command, self.path, self.request_version = command, path, version + + # Examine the headers and look for a Connection directive. + try: + self.headers = http.client.parse_headers(self.rfile, + _class=self.MessageClass) + except http.client.LineTooLong as err: + self.send_error( + HTTPStatus.REQUEST_HEADER_FIELDS_TOO_LARGE, + "Line too long", + str(err)) + return False + except http.client.HTTPException as err: + self.send_error( + HTTPStatus.REQUEST_HEADER_FIELDS_TOO_LARGE, + "Too many headers", + str(err) + ) + return False + + conntype = self.headers.get('Connection', "") + if conntype.lower() == 'close': + self.close_connection = True + elif (conntype.lower() == 'keep-alive' and + self.protocol_version >= "HTTP/1.1"): + self.close_connection = False + # Examine the headers and look for an Expect directive + expect = self.headers.get('Expect', "") + if (expect.lower() == "100-continue" and + self.protocol_version >= "HTTP/1.1" and + self.request_version >= "HTTP/1.1"): + if not self.handle_expect_100(): + return False + return True + + def handle_expect_100(self): + """Decide what to do with an "Expect: 100-continue" header. + + If the client is expecting a 100 Continue response, we must + respond with either a 100 Continue or a final response before + waiting for the request body. The default is to always respond + with a 100 Continue. You can behave differently (for example, + reject unauthorized requests) by overriding this method. + + This method should either return True (possibly after sending + a 100 Continue response) or send an error response and return + False. + + """ + self.send_response_only(HTTPStatus.CONTINUE) + self.end_headers() + return True + + def handle_one_request(self): + """Handle a single HTTP request. + + You normally don't need to override this method; see the class + __doc__ string for information on how to handle specific HTTP + commands such as GET and POST. + + """ + try: + self.raw_requestline = self.rfile.readline(65537) + if len(self.raw_requestline) > 65536: + self.requestline = '' + self.request_version = '' + self.command = '' + self.send_error(HTTPStatus.REQUEST_URI_TOO_LONG) + return + if not self.raw_requestline: + self.close_connection = True + return + if not self.parse_request(): + # An error code has been sent, just exit + return + mname = 'do_' + self.command + if not hasattr(self, mname): + self.send_error( + HTTPStatus.NOT_IMPLEMENTED, + "Unsupported method (%r)" % self.command) + return + method = getattr(self, mname) + method() + self.wfile.flush() #actually send the response if not already done. + except socket.timeout as e: + #a read or a write timed out. Discard this connection + self.log_error("Request timed out: %r", e) + self.close_connection = True + return + + def handle(self): + """Handle multiple requests if necessary.""" + self.close_connection = True + + self.handle_one_request() + while not self.close_connection: + self.handle_one_request() + + def send_error(self, code, message=None, explain=None): + """Send and log an error reply. + + Arguments are + * code: an HTTP error code + 3 digits + * message: a simple optional 1 line reason phrase. + *( HTAB / SP / VCHAR / %x80-FF ) + defaults to short entry matching the response code + * explain: a detailed message defaults to the long entry + matching the response code. + + This sends an error response (so it must be called before any + output has been generated), logs the error, and finally sends + a piece of HTML explaining the error to the user. + + """ + + try: + shortmsg, longmsg = self.responses[code] + except KeyError: + shortmsg, longmsg = '???', '???' + if message is None: + message = shortmsg + if explain is None: + explain = longmsg + self.log_error("code %d, message %s", code, message) + self.send_response(code, message) + self.send_header('Connection', 'close') + + # Message body is omitted for cases described in: + # - RFC7230: 3.3. 1xx, 204(No Content), 304(Not Modified) + # - RFC7231: 6.3.6. 205(Reset Content) + body = None + if (code >= 200 and + code not in (HTTPStatus.NO_CONTENT, + HTTPStatus.RESET_CONTENT, + HTTPStatus.NOT_MODIFIED)): + # HTML encode to prevent Cross Site Scripting attacks + # (see bug #1100201) + content = (self.error_message_format % { + 'code': code, + 'message': html.escape(message, quote=False), + 'explain': html.escape(explain, quote=False) + }) + body = content.encode('UTF-8', 'replace') + self.send_header("Content-Type", self.error_content_type) + self.send_header('Content-Length', int(len(body))) + self.end_headers() + + if self.command != 'HEAD' and body: + self.wfile.write(body) + + def send_response(self, code, message=None): + """Add the response header to the headers buffer and log the + response code. + + Also send two standard headers with the server software + version and the current date. + + """ + self.log_request(code) + self.send_response_only(code, message) + self.send_header('Server', self.version_string()) + self.send_header('Date', self.date_time_string()) + + def send_response_only(self, code, message=None): + """Send the response header only.""" + if self.request_version != 'HTTP/0.9': + if message is None: + if code in self.responses: + message = self.responses[code][0] + else: + message = '' + if not hasattr(self, '_headers_buffer'): + self._headers_buffer = [] + self._headers_buffer.append(("%s %d %s\r\n" % + (self.protocol_version, code, message)).encode( + 'latin-1', 'strict')) + + def send_header(self, keyword, value): + """Send a MIME header to the headers buffer.""" + if self.request_version != 'HTTP/0.9': + if not hasattr(self, '_headers_buffer'): + self._headers_buffer = [] + self._headers_buffer.append( + ("%s: %s\r\n" % (keyword, value)).encode('latin-1', 'strict')) + + if keyword.lower() == 'connection': + if value.lower() == 'close': + self.close_connection = True + elif value.lower() == 'keep-alive': + self.close_connection = False + + def end_headers(self): + """Send the blank line ending the MIME headers.""" + if self.request_version != 'HTTP/0.9': + self._headers_buffer.append(b"\r\n") + self.flush_headers() + + def flush_headers(self): + if hasattr(self, '_headers_buffer'): + self.wfile.write(b"".join(self._headers_buffer)) + self._headers_buffer = [] + + def log_request(self, code='-', size='-'): + """Log an accepted request. + + This is called by send_response(). + + """ + if isinstance(code, HTTPStatus): + code = code.value + self.log_message('"%s" %s %s', + self.requestline, str(code), str(size)) + + def log_error(self, format, *args): + """Log an error. + + This is called when a request cannot be fulfilled. By + default it passes the message on to log_message(). + + Arguments are the same as for log_message(). + + XXX This should go to the separate error log. + + """ + + self.log_message(format, *args) + + def log_message(self, format, *args): + """Log an arbitrary message. + + This is used by all other logging functions. Override + it if you have specific logging wishes. + + The first argument, FORMAT, is a format string for the + message to be logged. If the format string contains + any % escapes requiring parameters, they should be + specified as subsequent arguments (it's just like + printf!). + + The client ip and current date/time are prefixed to + every message. + + """ + + sys.stderr.write("%s - - [%s] %s\n" % + (self.address_string(), + self.log_date_time_string(), + format%args)) + + def version_string(self): + """Return the server software version string.""" + return self.server_version + ' ' + self.sys_version + + def date_time_string(self, timestamp=None): + """Return the current date and time formatted for a message header.""" + if timestamp is None: + timestamp = time.time() + return email.utils.formatdate(timestamp, usegmt=True) + + def log_date_time_string(self): + """Return the current time formatted for logging.""" + now = time.time() + year, month, day, hh, mm, ss, x, y, z = time.localtime(now) + s = "%02d/%3s/%04d %02d:%02d:%02d" % ( + day, self.monthname[month], year, hh, mm, ss) + return s + + weekdayname = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] + + monthname = [None, + 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', + 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] + + def address_string(self): + """Return the client address.""" + + return self.client_address[0] + + # Essentially static class variables + + # The version of the HTTP protocol we support. + # Set this to HTTP/1.1 to enable automatic keepalive + protocol_version = "HTTP/1.0" + + # MessageClass used to parse headers + MessageClass = http.client.HTTPMessage + + # hack to maintain backwards compatibility + responses = { + v: (v.phrase, v.description) + for v in HTTPStatus.__members__.values() + } + + +class SimpleHTTPRequestHandler(BaseHTTPRequestHandler): + + """Simple HTTP request handler with GET and HEAD commands. + + This serves files from the current directory and any of its + subdirectories. The MIME type for files is determined by + calling the .guess_type() method. + + The GET and HEAD requests are identical except that the HEAD + request omits the actual contents of the file. + + """ + + server_version = "SimpleHTTP/" + __version__ + + def do_GET(self): + """Serve a GET request.""" + f = self.send_head() + if f: + try: + self.copyfile(f, self.wfile) + finally: + f.close() + + def do_HEAD(self): + """Serve a HEAD request.""" + f = self.send_head() + if f: + f.close() + + def send_head(self): + """Common code for GET and HEAD commands. + + This sends the response code and MIME headers. + + Return value is either a file object (which has to be copied + to the outputfile by the caller unless the command was HEAD, + and must be closed by the caller under all circumstances), or + None, in which case the caller has nothing further to do. + + """ + path = self.translate_path(self.path) + f = None + if os.path.isdir(path): + parts = urllib.parse.urlsplit(self.path) + if not parts.path.endswith('/'): + # redirect browser - doing basically what apache does + self.send_response(HTTPStatus.MOVED_PERMANENTLY) + new_parts = (parts[0], parts[1], parts[2] + '/', + parts[3], parts[4]) + new_url = urllib.parse.urlunsplit(new_parts) + self.send_header("Location", new_url) + self.end_headers() + return None + for index in "index.html", "index.htm": + index = os.path.join(path, index) + if os.path.exists(index): + path = index + break + else: + return self.list_directory(path) + ctype = self.guess_type(path) + try: + f = open(path, 'rb') + except OSError: + self.send_error(HTTPStatus.NOT_FOUND, "File not found") + return None + try: + self.send_response(HTTPStatus.OK) + self.send_header("Content-type", ctype) + fs = os.fstat(f.fileno()) + self.send_header("Content-Length", str(fs[6])) + self.send_header("Last-Modified", self.date_time_string(fs.st_mtime)) + self.end_headers() + return f + except: + f.close() + raise + + def list_directory(self, path): + """Helper to produce a directory listing (absent index.html). + + Return value is either a file object, or None (indicating an + error). In either case, the headers are sent, making the + interface the same as for send_head(). + + """ + try: + list = os.listdir(path) + except OSError: + self.send_error( + HTTPStatus.NOT_FOUND, + "No permission to list directory") + return None + list.sort(key=lambda a: a.lower()) + r = [] + try: + displaypath = urllib.parse.unquote(self.path, + errors='surrogatepass') + except UnicodeDecodeError: + displaypath = urllib.parse.unquote(path) + displaypath = html.escape(displaypath, quote=False) + enc = sys.getfilesystemencoding() + title = 'Directory listing for %s' % displaypath + r.append('') + r.append('\n') + r.append('' % enc) + r.append('%s\n' % title) + r.append('\n

%s

' % title) + r.append('
\n
    ') + for name in list: + fullname = os.path.join(path, name) + displayname = linkname = name + # Append / for directories or @ for symbolic links + if os.path.isdir(fullname): + displayname = name + "/" + linkname = name + "/" + if os.path.islink(fullname): + displayname = name + "@" + # Note: a link to a directory displays with @ and links with / + r.append('
  • %s
  • ' + % (urllib.parse.quote(linkname, + errors='surrogatepass'), + html.escape(displayname, quote=False))) + r.append('
\n
\n\n\n') + encoded = '\n'.join(r).encode(enc, 'surrogateescape') + f = io.BytesIO() + f.write(encoded) + f.seek(0) + self.send_response(HTTPStatus.OK) + self.send_header("Content-type", "text/html; charset=%s" % enc) + self.send_header("Content-Length", str(len(encoded))) + self.end_headers() + return f + + def translate_path(self, path): + """Translate a /-separated PATH to the local filename syntax. + + Components that mean special things to the local file system + (e.g. drive or directory names) are ignored. (XXX They should + probably be diagnosed.) + + """ + # abandon query parameters + path = path.split('?',1)[0] + path = path.split('#',1)[0] + # Don't forget explicit trailing slash when normalizing. Issue17324 + trailing_slash = path.rstrip().endswith('/') + try: + path = urllib.parse.unquote(path, errors='surrogatepass') + except UnicodeDecodeError: + path = urllib.parse.unquote(path) + path = posixpath.normpath(path) + words = path.split('/') + words = filter(None, words) + path = os.getcwd() + for word in words: + if os.path.dirname(word) or word in (os.curdir, os.pardir): + # Ignore components that are not a simple file/directory name + continue + path = os.path.join(path, word) + if trailing_slash: + path += '/' + return path + + def copyfile(self, source, outputfile): + """Copy all data between two file objects. + + The SOURCE argument is a file object open for reading + (or anything with a read() method) and the DESTINATION + argument is a file object open for writing (or + anything with a write() method). + + The only reason for overriding this would be to change + the block size or perhaps to replace newlines by CRLF + -- note however that this the default server uses this + to copy binary data as well. + + """ + shutil.copyfileobj(source, outputfile) + + def guess_type(self, path): + """Guess the type of a file. + + Argument is a PATH (a filename). + + Return value is a string of the form type/subtype, + usable for a MIME Content-type header. + + The default implementation looks the file's extension + up in the table self.extensions_map, using application/octet-stream + as a default; however it would be permissible (if + slow) to look inside the data to make a better guess. + + """ + + base, ext = posixpath.splitext(path) + if ext in self.extensions_map: + return self.extensions_map[ext] + ext = ext.lower() + if ext in self.extensions_map: + return self.extensions_map[ext] + else: + return self.extensions_map[''] + + if not mimetypes.inited: + mimetypes.init() # try to read system mime.types + extensions_map = mimetypes.types_map.copy() + extensions_map.update({ + '': 'application/octet-stream', # Default + '.py': 'text/plain', + '.c': 'text/plain', + '.h': 'text/plain', + }) + + +# Utilities for CGIHTTPRequestHandler + +def _url_collapse_path(path): + """ + Given a URL path, remove extra '/'s and '.' path elements and collapse + any '..' references and returns a collapsed path. + + Implements something akin to RFC-2396 5.2 step 6 to parse relative paths. + The utility of this function is limited to is_cgi method and helps + preventing some security attacks. + + Returns: The reconstituted URL, which will always start with a '/'. + + Raises: IndexError if too many '..' occur within the path. + + """ + # Query component should not be involved. + path, _, query = path.partition('?') + path = urllib.parse.unquote(path) + + # Similar to os.path.split(os.path.normpath(path)) but specific to URL + # path semantics rather than local operating system semantics. + path_parts = path.split('/') + head_parts = [] + for part in path_parts[:-1]: + if part == '..': + head_parts.pop() # IndexError if more '..' than prior parts + elif part and part != '.': + head_parts.append( part ) + if path_parts: + tail_part = path_parts.pop() + if tail_part: + if tail_part == '..': + head_parts.pop() + tail_part = '' + elif tail_part == '.': + tail_part = '' + else: + tail_part = '' + + if query: + tail_part = '?'.join((tail_part, query)) + + splitpath = ('/' + '/'.join(head_parts), tail_part) + collapsed_path = "/".join(splitpath) + + return collapsed_path + + + +nobody = None + +def nobody_uid(): + """Internal routine to get nobody's uid""" + global nobody + if nobody: + return nobody + try: + import pwd + except ImportError: + return -1 + try: + nobody = pwd.getpwnam('nobody')[2] + except KeyError: + nobody = 1 + max(x[2] for x in pwd.getpwall()) + return nobody + + +def executable(path): + """Test for executable file.""" + return os.access(path, os.X_OK) + + +class CGIHTTPRequestHandler(SimpleHTTPRequestHandler): + + """Complete HTTP server with GET, HEAD and POST commands. + + GET and HEAD also support running CGI scripts. + + The POST command is *only* implemented for CGI scripts. + + """ + + # Determine platform specifics + have_fork = hasattr(os, 'fork') + + # Make rfile unbuffered -- we need to read one line and then pass + # the rest to a subprocess, so we can't use buffered input. + rbufsize = 0 + + def do_POST(self): + """Serve a POST request. + + This is only implemented for CGI scripts. + + """ + + if self.is_cgi(): + self.run_cgi() + else: + self.send_error( + HTTPStatus.NOT_IMPLEMENTED, + "Can only POST to CGI scripts") + + def send_head(self): + """Version of send_head that support CGI scripts""" + if self.is_cgi(): + return self.run_cgi() + else: + return SimpleHTTPRequestHandler.send_head(self) + + def is_cgi(self): + """Test whether self.path corresponds to a CGI script. + + Returns True and updates the cgi_info attribute to the tuple + (dir, rest) if self.path requires running a CGI script. + Returns False otherwise. + + If any exception is raised, the caller should assume that + self.path was rejected as invalid and act accordingly. + + The default implementation tests whether the normalized url + path begins with one of the strings in self.cgi_directories + (and the next character is a '/' or the end of the string). + + """ + collapsed_path = _url_collapse_path(self.path) + dir_sep = collapsed_path.find('/', 1) + head, tail = collapsed_path[:dir_sep], collapsed_path[dir_sep+1:] + if head in self.cgi_directories: + self.cgi_info = head, tail + return True + return False + + + cgi_directories = ['/cgi-bin', '/htbin'] + + def is_executable(self, path): + """Test whether argument path is an executable file.""" + return executable(path) + + def is_python(self, path): + """Test whether argument path is a Python script.""" + head, tail = os.path.splitext(path) + return tail.lower() in (".py", ".pyw") + + def run_cgi(self): + """Execute a CGI script.""" + dir, rest = self.cgi_info + path = dir + '/' + rest + i = path.find('/', len(dir)+1) + while i >= 0: + nextdir = path[:i] + nextrest = path[i+1:] + + scriptdir = self.translate_path(nextdir) + if os.path.isdir(scriptdir): + dir, rest = nextdir, nextrest + i = path.find('/', len(dir)+1) + else: + break + + # find an explicit query string, if present. + rest, _, query = rest.partition('?') + + # dissect the part after the directory name into a script name & + # a possible additional path, to be stored in PATH_INFO. + i = rest.find('/') + if i >= 0: + script, rest = rest[:i], rest[i:] + else: + script, rest = rest, '' + + scriptname = dir + '/' + script + scriptfile = self.translate_path(scriptname) + if not os.path.exists(scriptfile): + self.send_error( + HTTPStatus.NOT_FOUND, + "No such CGI script (%r)" % scriptname) + return + if not os.path.isfile(scriptfile): + self.send_error( + HTTPStatus.FORBIDDEN, + "CGI script is not a plain file (%r)" % scriptname) + return + ispy = self.is_python(scriptname) + if self.have_fork or not ispy: + if not self.is_executable(scriptfile): + self.send_error( + HTTPStatus.FORBIDDEN, + "CGI script is not executable (%r)" % scriptname) + return + + # Reference: http://hoohoo.ncsa.uiuc.edu/cgi/env.html + # XXX Much of the following could be prepared ahead of time! + env = copy.deepcopy(os.environ) + env['SERVER_SOFTWARE'] = self.version_string() + env['SERVER_NAME'] = self.server.server_name + env['GATEWAY_INTERFACE'] = 'CGI/1.1' + env['SERVER_PROTOCOL'] = self.protocol_version + env['SERVER_PORT'] = str(self.server.server_port) + env['REQUEST_METHOD'] = self.command + uqrest = urllib.parse.unquote(rest) + env['PATH_INFO'] = uqrest + env['PATH_TRANSLATED'] = self.translate_path(uqrest) + env['SCRIPT_NAME'] = scriptname + if query: + env['QUERY_STRING'] = query + env['REMOTE_ADDR'] = self.client_address[0] + authorization = self.headers.get("authorization") + if authorization: + authorization = authorization.split() + if len(authorization) == 2: + import base64, binascii + env['AUTH_TYPE'] = authorization[0] + if authorization[0].lower() == "basic": + try: + authorization = authorization[1].encode('ascii') + authorization = base64.decodebytes(authorization).\ + decode('ascii') + except (binascii.Error, UnicodeError): + pass + else: + authorization = authorization.split(':') + if len(authorization) == 2: + env['REMOTE_USER'] = authorization[0] + # XXX REMOTE_IDENT + if self.headers.get('content-type') is None: + env['CONTENT_TYPE'] = self.headers.get_content_type() + else: + env['CONTENT_TYPE'] = self.headers['content-type'] + length = self.headers.get('content-length') + if length: + env['CONTENT_LENGTH'] = length + referer = self.headers.get('referer') + if referer: + env['HTTP_REFERER'] = referer + accept = [] + for line in self.headers.getallmatchingheaders('accept'): + if line[:1] in "\t\n\r ": + accept.append(line.strip()) + else: + accept = accept + line[7:].split(',') + env['HTTP_ACCEPT'] = ','.join(accept) + ua = self.headers.get('user-agent') + if ua: + env['HTTP_USER_AGENT'] = ua + co = filter(None, self.headers.get_all('cookie', [])) + cookie_str = ', '.join(co) + if cookie_str: + env['HTTP_COOKIE'] = cookie_str + # XXX Other HTTP_* headers + # Since we're setting the env in the parent, provide empty + # values to override previously set values + for k in ('QUERY_STRING', 'REMOTE_HOST', 'CONTENT_LENGTH', + 'HTTP_USER_AGENT', 'HTTP_COOKIE', 'HTTP_REFERER'): + env.setdefault(k, "") + + self.send_response(HTTPStatus.OK, "Script output follows") + self.flush_headers() + + decoded_query = query.replace('+', ' ') + + if self.have_fork: + # Unix -- fork as we should + args = [script] + if '=' not in decoded_query: + args.append(decoded_query) + nobody = nobody_uid() + self.wfile.flush() # Always flush before forking + pid = os.fork() + if pid != 0: + # Parent + pid, sts = os.waitpid(pid, 0) + # throw away additional data [see bug #427345] + while select.select([self.rfile], [], [], 0)[0]: + if not self.rfile.read(1): + break + if sts: + self.log_error("CGI script exit status %#x", sts) + return + # Child + try: + try: + os.setuid(nobody) + except OSError: + pass + os.dup2(self.rfile.fileno(), 0) + os.dup2(self.wfile.fileno(), 1) + os.execve(scriptfile, args, env) + except: + self.server.handle_error(self.request, self.client_address) + os._exit(127) + + else: + # Non-Unix -- use subprocess + import subprocess + cmdline = [scriptfile] + if self.is_python(scriptfile): + interp = sys.executable + if interp.lower().endswith("w.exe"): + # On Windows, use python.exe, not pythonw.exe + interp = interp[:-5] + interp[-4:] + cmdline = [interp, '-u'] + cmdline + if '=' not in query: + cmdline.append(query) + self.log_message("command: %s", subprocess.list2cmdline(cmdline)) + try: + nbytes = int(length) + except (TypeError, ValueError): + nbytes = 0 + p = subprocess.Popen(cmdline, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env = env + ) + if self.command.lower() == "post" and nbytes > 0: + data = self.rfile.read(nbytes) + else: + data = None + # throw away additional data [see bug #427345] + while select.select([self.rfile._sock], [], [], 0)[0]: + if not self.rfile._sock.recv(1): + break + stdout, stderr = p.communicate(data) + self.wfile.write(stdout) + if stderr: + self.log_error('%s', stderr) + p.stderr.close() + p.stdout.close() + status = p.returncode + if status: + self.log_error("CGI script exit status %#x", status) + else: + self.log_message("CGI script exited OK") + + +def test(HandlerClass=BaseHTTPRequestHandler, + ServerClass=HTTPServer, protocol="HTTP/1.0", port=8000, bind=""): + """Test the HTTP request handler class. + + This runs an HTTP server on port 8000 (or the port argument). + + """ + server_address = (bind, port) + + HandlerClass.protocol_version = protocol + with ServerClass(server_address, HandlerClass) as httpd: + sa = httpd.socket.getsockname() + serve_message = "Serving HTTP on {host} port {port} (http://{host}:{port}/) ..." + print(serve_message.format(host=sa[0], port=sa[1])) + try: + httpd.serve_forever() + except KeyboardInterrupt: + print("\nKeyboard interrupt received, exiting.") + sys.exit(0) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--cgi', action='store_true', + help='Run as CGI Server') + parser.add_argument('--bind', '-b', default='', metavar='ADDRESS', + help='Specify alternate bind address ' + '[default: all interfaces]') + parser.add_argument('port', action='store', + default=8000, type=int, + nargs='?', + help='Specify alternate port [default: 8000]') + args = parser.parse_args() + if args.cgi: + handler_class = CGIHTTPRequestHandler + else: + handler_class = SimpleHTTPRequestHandler + test(HandlerClass=handler_class, port=args.port, bind=args.bind) diff --git a/Lib/mimetypes.py b/Lib/mimetypes.py new file mode 100644 index 0000000000..9a886803dc --- /dev/null +++ b/Lib/mimetypes.py @@ -0,0 +1,597 @@ +"""Guess the MIME type of a file. + +This module defines two useful functions: + +guess_type(url, strict=True) -- guess the MIME type and encoding of a URL. + +guess_extension(type, strict=True) -- guess the extension for a given MIME type. + +It also contains the following, for tuning the behavior: + +Data: + +knownfiles -- list of files to parse +inited -- flag set when init() has been called +suffix_map -- dictionary mapping suffixes to suffixes +encodings_map -- dictionary mapping suffixes to encodings +types_map -- dictionary mapping suffixes to types + +Functions: + +init([files]) -- parse a list of files, default knownfiles (on Windows, the + default values are taken from the registry) +read_mime_types(file) -- parse one file, return a dictionary or None +""" + +import os +import sys +import posixpath +import urllib.parse +try: + import winreg as _winreg +except ImportError: + _winreg = None + +__all__ = [ + "knownfiles", "inited", "MimeTypes", + "guess_type", "guess_all_extensions", "guess_extension", + "add_type", "init", "read_mime_types", + "suffix_map", "encodings_map", "types_map", "common_types" +] + +knownfiles = [ + "/etc/mime.types", + "/etc/httpd/mime.types", # Mac OS X + "/etc/httpd/conf/mime.types", # Apache + "/etc/apache/mime.types", # Apache 1 + "/etc/apache2/mime.types", # Apache 2 + "/usr/local/etc/httpd/conf/mime.types", + "/usr/local/lib/netscape/mime.types", + "/usr/local/etc/httpd/conf/mime.types", # Apache 1.2 + "/usr/local/etc/mime.types", # Apache 1.3 + ] + +inited = False +_db = None + + +class MimeTypes: + """MIME-types datastore. + + This datastore can handle information from mime.types-style files + and supports basic determination of MIME type from a filename or + URL, and can guess a reasonable extension given a MIME type. + """ + + def __init__(self, filenames=(), strict=True): + if not inited: + init() + self.encodings_map = encodings_map.copy() + self.suffix_map = suffix_map.copy() + self.types_map = ({}, {}) # dict for (non-strict, strict) + self.types_map_inv = ({}, {}) + for (ext, type) in types_map.items(): + self.add_type(type, ext, True) + for (ext, type) in common_types.items(): + self.add_type(type, ext, False) + for name in filenames: + self.read(name, strict) + + def add_type(self, type, ext, strict=True): + """Add a mapping between a type and an extension. + + When the extension is already known, the new + type will replace the old one. When the type + is already known the extension will be added + to the list of known extensions. + + If strict is true, information will be added to + list of standard types, else to the list of non-standard + types. + """ + self.types_map[strict][ext] = type + exts = self.types_map_inv[strict].setdefault(type, []) + if ext not in exts: + exts.append(ext) + + def guess_type(self, url, strict=True): + """Guess the type of a file based on its URL. + + Return value is a tuple (type, encoding) where type is None if + the type can't be guessed (no or unknown suffix) or a string + of the form type/subtype, usable for a MIME Content-type + header; and encoding is None for no encoding or the name of + the program used to encode (e.g. compress or gzip). The + mappings are table driven. Encoding suffixes are case + sensitive; type suffixes are first tried case sensitive, then + case insensitive. + + The suffixes .tgz, .taz and .tz (case sensitive!) are all + mapped to '.tar.gz'. (This is table-driven too, using the + dictionary suffix_map.) + + Optional `strict' argument when False adds a bunch of commonly found, + but non-standard types. + """ + scheme, url = urllib.parse.splittype(url) + if scheme == 'data': + # syntax of data URLs: + # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data + # mediatype := [ type "/" subtype ] *( ";" parameter ) + # data := *urlchar + # parameter := attribute "=" value + # type/subtype defaults to "text/plain" + comma = url.find(',') + if comma < 0: + # bad data URL + return None, None + semi = url.find(';', 0, comma) + if semi >= 0: + type = url[:semi] + else: + type = url[:comma] + if '=' in type or '/' not in type: + type = 'text/plain' + return type, None # never compressed, so encoding is None + base, ext = posixpath.splitext(url) + while ext in self.suffix_map: + base, ext = posixpath.splitext(base + self.suffix_map[ext]) + if ext in self.encodings_map: + encoding = self.encodings_map[ext] + base, ext = posixpath.splitext(base) + else: + encoding = None + types_map = self.types_map[True] + if ext in types_map: + return types_map[ext], encoding + elif ext.lower() in types_map: + return types_map[ext.lower()], encoding + elif strict: + return None, encoding + types_map = self.types_map[False] + if ext in types_map: + return types_map[ext], encoding + elif ext.lower() in types_map: + return types_map[ext.lower()], encoding + else: + return None, encoding + + def guess_all_extensions(self, type, strict=True): + """Guess the extensions for a file based on its MIME type. + + Return value is a list of strings giving the possible filename + extensions, including the leading dot ('.'). The extension is not + guaranteed to have been associated with any particular data stream, + but would be mapped to the MIME type `type' by guess_type(). + + Optional `strict' argument when false adds a bunch of commonly found, + but non-standard types. + """ + type = type.lower() + extensions = self.types_map_inv[True].get(type, []) + if not strict: + for ext in self.types_map_inv[False].get(type, []): + if ext not in extensions: + extensions.append(ext) + return extensions + + def guess_extension(self, type, strict=True): + """Guess the extension for a file based on its MIME type. + + Return value is a string giving a filename extension, + including the leading dot ('.'). The extension is not + guaranteed to have been associated with any particular data + stream, but would be mapped to the MIME type `type' by + guess_type(). If no extension can be guessed for `type', None + is returned. + + Optional `strict' argument when false adds a bunch of commonly found, + but non-standard types. + """ + extensions = self.guess_all_extensions(type, strict) + if not extensions: + return None + return extensions[0] + + def read(self, filename, strict=True): + """ + Read a single mime.types-format file, specified by pathname. + + If strict is true, information will be added to + list of standard types, else to the list of non-standard + types. + """ + with open(filename, encoding='utf-8') as fp: + self.readfp(fp, strict) + + def readfp(self, fp, strict=True): + """ + Read a single mime.types-format file. + + If strict is true, information will be added to + list of standard types, else to the list of non-standard + types. + """ + while 1: + line = fp.readline() + if not line: + break + words = line.split() + for i in range(len(words)): + if words[i][0] == '#': + del words[i:] + break + if not words: + continue + type, suffixes = words[0], words[1:] + for suff in suffixes: + self.add_type(type, '.' + suff, strict) + + def read_windows_registry(self, strict=True): + """ + Load the MIME types database from Windows registry. + + If strict is true, information will be added to + list of standard types, else to the list of non-standard + types. + """ + + # Windows only + if not _winreg: + return + + def enum_types(mimedb): + i = 0 + while True: + try: + ctype = _winreg.EnumKey(mimedb, i) + except EnvironmentError: + break + else: + if '\0' not in ctype: + yield ctype + i += 1 + + with _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT, '') as hkcr: + for subkeyname in enum_types(hkcr): + try: + with _winreg.OpenKey(hkcr, subkeyname) as subkey: + # Only check file extensions + if not subkeyname.startswith("."): + continue + # raises EnvironmentError if no 'Content Type' value + mimetype, datatype = _winreg.QueryValueEx( + subkey, 'Content Type') + if datatype != _winreg.REG_SZ: + continue + self.add_type(mimetype, subkeyname, strict) + except EnvironmentError: + continue + +def guess_type(url, strict=True): + """Guess the type of a file based on its URL. + + Return value is a tuple (type, encoding) where type is None if the + type can't be guessed (no or unknown suffix) or a string of the + form type/subtype, usable for a MIME Content-type header; and + encoding is None for no encoding or the name of the program used + to encode (e.g. compress or gzip). The mappings are table + driven. Encoding suffixes are case sensitive; type suffixes are + first tried case sensitive, then case insensitive. + + The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped + to ".tar.gz". (This is table-driven too, using the dictionary + suffix_map). + + Optional `strict' argument when false adds a bunch of commonly found, but + non-standard types. + """ + if _db is None: + init() + return _db.guess_type(url, strict) + + +def guess_all_extensions(type, strict=True): + """Guess the extensions for a file based on its MIME type. + + Return value is a list of strings giving the possible filename + extensions, including the leading dot ('.'). The extension is not + guaranteed to have been associated with any particular data + stream, but would be mapped to the MIME type `type' by + guess_type(). If no extension can be guessed for `type', None + is returned. + + Optional `strict' argument when false adds a bunch of commonly found, + but non-standard types. + """ + if _db is None: + init() + return _db.guess_all_extensions(type, strict) + +def guess_extension(type, strict=True): + """Guess the extension for a file based on its MIME type. + + Return value is a string giving a filename extension, including the + leading dot ('.'). The extension is not guaranteed to have been + associated with any particular data stream, but would be mapped to the + MIME type `type' by guess_type(). If no extension can be guessed for + `type', None is returned. + + Optional `strict' argument when false adds a bunch of commonly found, + but non-standard types. + """ + if _db is None: + init() + return _db.guess_extension(type, strict) + +def add_type(type, ext, strict=True): + """Add a mapping between a type and an extension. + + When the extension is already known, the new + type will replace the old one. When the type + is already known the extension will be added + to the list of known extensions. + + If strict is true, information will be added to + list of standard types, else to the list of non-standard + types. + """ + if _db is None: + init() + return _db.add_type(type, ext, strict) + + +def init(files=None): + global suffix_map, types_map, encodings_map, common_types + global inited, _db + inited = True # so that MimeTypes.__init__() doesn't call us again + db = MimeTypes() + if files is None: + if _winreg: + db.read_windows_registry() + files = knownfiles + for file in files: + if os.path.isfile(file): + db.read(file) + encodings_map = db.encodings_map + suffix_map = db.suffix_map + types_map = db.types_map[True] + common_types = db.types_map[False] + # Make the DB a global variable now that it is fully initialized + _db = db + + +def read_mime_types(file): + try: + f = open(file) + except OSError: + return None + with f: + db = MimeTypes() + db.readfp(f, True) + return db.types_map[True] + + +def _default_mime_types(): + global suffix_map + global encodings_map + global types_map + global common_types + + suffix_map = { + '.svgz': '.svg.gz', + '.tgz': '.tar.gz', + '.taz': '.tar.gz', + '.tz': '.tar.gz', + '.tbz2': '.tar.bz2', + '.txz': '.tar.xz', + } + + encodings_map = { + '.gz': 'gzip', + '.Z': 'compress', + '.bz2': 'bzip2', + '.xz': 'xz', + } + + # Before adding new types, make sure they are either registered with IANA, + # at http://www.iana.org/assignments/media-types + # or extensions, i.e. using the x- prefix + + # If you add to these, please keep them sorted! + types_map = { + '.a' : 'application/octet-stream', + '.ai' : 'application/postscript', + '.aif' : 'audio/x-aiff', + '.aifc' : 'audio/x-aiff', + '.aiff' : 'audio/x-aiff', + '.au' : 'audio/basic', + '.avi' : 'video/x-msvideo', + '.bat' : 'text/plain', + '.bcpio' : 'application/x-bcpio', + '.bin' : 'application/octet-stream', + '.bmp' : 'image/x-ms-bmp', + '.c' : 'text/plain', + # Duplicates :( + '.cdf' : 'application/x-cdf', + '.cdf' : 'application/x-netcdf', + '.cpio' : 'application/x-cpio', + '.csh' : 'application/x-csh', + '.css' : 'text/css', + '.csv' : 'text/csv', + '.dll' : 'application/octet-stream', + '.doc' : 'application/msword', + '.dot' : 'application/msword', + '.dvi' : 'application/x-dvi', + '.eml' : 'message/rfc822', + '.eps' : 'application/postscript', + '.etx' : 'text/x-setext', + '.exe' : 'application/octet-stream', + '.gif' : 'image/gif', + '.gtar' : 'application/x-gtar', + '.h' : 'text/plain', + '.hdf' : 'application/x-hdf', + '.htm' : 'text/html', + '.html' : 'text/html', + '.ico' : 'image/vnd.microsoft.icon', + '.ief' : 'image/ief', + '.jpe' : 'image/jpeg', + '.jpeg' : 'image/jpeg', + '.jpg' : 'image/jpeg', + '.js' : 'application/javascript', + '.ksh' : 'text/plain', + '.latex' : 'application/x-latex', + '.m1v' : 'video/mpeg', + '.m3u' : 'application/vnd.apple.mpegurl', + '.m3u8' : 'application/vnd.apple.mpegurl', + '.man' : 'application/x-troff-man', + '.me' : 'application/x-troff-me', + '.mht' : 'message/rfc822', + '.mhtml' : 'message/rfc822', + '.mif' : 'application/x-mif', + '.mov' : 'video/quicktime', + '.movie' : 'video/x-sgi-movie', + '.mp2' : 'audio/mpeg', + '.mp3' : 'audio/mpeg', + '.mp4' : 'video/mp4', + '.mpa' : 'video/mpeg', + '.mpe' : 'video/mpeg', + '.mpeg' : 'video/mpeg', + '.mpg' : 'video/mpeg', + '.ms' : 'application/x-troff-ms', + '.nc' : 'application/x-netcdf', + '.nws' : 'message/rfc822', + '.o' : 'application/octet-stream', + '.obj' : 'application/octet-stream', + '.oda' : 'application/oda', + '.p12' : 'application/x-pkcs12', + '.p7c' : 'application/pkcs7-mime', + '.pbm' : 'image/x-portable-bitmap', + '.pdf' : 'application/pdf', + '.pfx' : 'application/x-pkcs12', + '.pgm' : 'image/x-portable-graymap', + '.pl' : 'text/plain', + '.png' : 'image/png', + '.pnm' : 'image/x-portable-anymap', + '.pot' : 'application/vnd.ms-powerpoint', + '.ppa' : 'application/vnd.ms-powerpoint', + '.ppm' : 'image/x-portable-pixmap', + '.pps' : 'application/vnd.ms-powerpoint', + '.ppt' : 'application/vnd.ms-powerpoint', + '.ps' : 'application/postscript', + '.pwz' : 'application/vnd.ms-powerpoint', + '.py' : 'text/x-python', + '.pyc' : 'application/x-python-code', + '.pyo' : 'application/x-python-code', + '.qt' : 'video/quicktime', + '.ra' : 'audio/x-pn-realaudio', + '.ram' : 'application/x-pn-realaudio', + '.ras' : 'image/x-cmu-raster', + '.rdf' : 'application/xml', + '.rgb' : 'image/x-rgb', + '.roff' : 'application/x-troff', + '.rtx' : 'text/richtext', + '.sgm' : 'text/x-sgml', + '.sgml' : 'text/x-sgml', + '.sh' : 'application/x-sh', + '.shar' : 'application/x-shar', + '.snd' : 'audio/basic', + '.so' : 'application/octet-stream', + '.src' : 'application/x-wais-source', + '.sv4cpio': 'application/x-sv4cpio', + '.sv4crc' : 'application/x-sv4crc', + '.svg' : 'image/svg+xml', + '.swf' : 'application/x-shockwave-flash', + '.t' : 'application/x-troff', + '.tar' : 'application/x-tar', + '.tcl' : 'application/x-tcl', + '.tex' : 'application/x-tex', + '.texi' : 'application/x-texinfo', + '.texinfo': 'application/x-texinfo', + '.tif' : 'image/tiff', + '.tiff' : 'image/tiff', + '.tr' : 'application/x-troff', + '.tsv' : 'text/tab-separated-values', + '.txt' : 'text/plain', + '.ustar' : 'application/x-ustar', + '.vcf' : 'text/x-vcard', + '.wav' : 'audio/x-wav', + '.webm' : 'video/webm', + '.wiz' : 'application/msword', + '.wsdl' : 'application/xml', + '.xbm' : 'image/x-xbitmap', + '.xlb' : 'application/vnd.ms-excel', + # Duplicates :( + '.xls' : 'application/excel', + '.xls' : 'application/vnd.ms-excel', + '.xml' : 'text/xml', + '.xpdl' : 'application/xml', + '.xpm' : 'image/x-xpixmap', + '.xsl' : 'application/xml', + '.xwd' : 'image/x-xwindowdump', + '.zip' : 'application/zip', + } + + # These are non-standard types, commonly found in the wild. They will + # only match if strict=0 flag is given to the API methods. + + # Please sort these too + common_types = { + '.jpg' : 'image/jpg', + '.mid' : 'audio/midi', + '.midi': 'audio/midi', + '.pct' : 'image/pict', + '.pic' : 'image/pict', + '.pict': 'image/pict', + '.rtf' : 'application/rtf', + '.xul' : 'text/xul' + } + + +_default_mime_types() + + +if __name__ == '__main__': + import getopt + + USAGE = """\ +Usage: mimetypes.py [options] type + +Options: + --help / -h -- print this message and exit + --lenient / -l -- additionally search of some common, but non-standard + types. + --extension / -e -- guess extension instead of type + +More than one type argument may be given. +""" + + def usage(code, msg=''): + print(USAGE) + if msg: print(msg) + sys.exit(code) + + try: + opts, args = getopt.getopt(sys.argv[1:], 'hle', + ['help', 'lenient', 'extension']) + except getopt.error as msg: + usage(1, msg) + + strict = 1 + extension = 0 + for opt, arg in opts: + if opt in ('-h', '--help'): + usage(0) + elif opt in ('-l', '--lenient'): + strict = 0 + elif opt in ('-e', '--extension'): + extension = 1 + for gtype in args: + if extension: + guess = guess_extension(gtype, strict) + if not guess: print("I don't know anything about type", gtype) + else: print(guess) + else: + guess, encoding = guess_type(gtype, strict) + if not guess: print("I don't know anything about type", gtype) + else: print('type:', guess, 'encoding:', encoding) diff --git a/Lib/socketserver.py b/Lib/socketserver.py new file mode 100644 index 0000000000..41a3766772 --- /dev/null +++ b/Lib/socketserver.py @@ -0,0 +1,793 @@ +"""Generic socket server classes. + +This module tries to capture the various aspects of defining a server: + +For socket-based servers: + +- address family: + - AF_INET{,6}: IP (Internet Protocol) sockets (default) + - AF_UNIX: Unix domain sockets + - others, e.g. AF_DECNET are conceivable (see +- socket type: + - SOCK_STREAM (reliable stream, e.g. TCP) + - SOCK_DGRAM (datagrams, e.g. UDP) + +For request-based servers (including socket-based): + +- client address verification before further looking at the request + (This is actually a hook for any processing that needs to look + at the request before anything else, e.g. logging) +- how to handle multiple requests: + - synchronous (one request is handled at a time) + - forking (each request is handled by a new process) + - threading (each request is handled by a new thread) + +The classes in this module favor the server type that is simplest to +write: a synchronous TCP/IP server. This is bad class design, but +save some typing. (There's also the issue that a deep class hierarchy +slows down method lookups.) + +There are five classes in an inheritance diagram, four of which represent +synchronous servers of four types: + + +------------+ + | BaseServer | + +------------+ + | + v + +-----------+ +------------------+ + | TCPServer |------->| UnixStreamServer | + +-----------+ +------------------+ + | + v + +-----------+ +--------------------+ + | UDPServer |------->| UnixDatagramServer | + +-----------+ +--------------------+ + +Note that UnixDatagramServer derives from UDPServer, not from +UnixStreamServer -- the only difference between an IP and a Unix +stream server is the address family, which is simply repeated in both +unix server classes. + +Forking and threading versions of each type of server can be created +using the ForkingMixIn and ThreadingMixIn mix-in classes. For +instance, a threading UDP server class is created as follows: + + class ThreadingUDPServer(ThreadingMixIn, UDPServer): pass + +The Mix-in class must come first, since it overrides a method defined +in UDPServer! Setting the various member variables also changes +the behavior of the underlying server mechanism. + +To implement a service, you must derive a class from +BaseRequestHandler and redefine its handle() method. You can then run +various versions of the service by combining one of the server classes +with your request handler class. + +The request handler class must be different for datagram or stream +services. This can be hidden by using the request handler +subclasses StreamRequestHandler or DatagramRequestHandler. + +Of course, you still have to use your head! + +For instance, it makes no sense to use a forking server if the service +contains state in memory that can be modified by requests (since the +modifications in the child process would never reach the initial state +kept in the parent process and passed to each child). In this case, +you can use a threading server, but you will probably have to use +locks to avoid two requests that come in nearly simultaneous to apply +conflicting changes to the server state. + +On the other hand, if you are building e.g. an HTTP server, where all +data is stored externally (e.g. in the file system), a synchronous +class will essentially render the service "deaf" while one request is +being handled -- which may be for a very long time if a client is slow +to read all the data it has requested. Here a threading or forking +server is appropriate. + +In some cases, it may be appropriate to process part of a request +synchronously, but to finish processing in a forked child depending on +the request data. This can be implemented by using a synchronous +server and doing an explicit fork in the request handler class +handle() method. + +Another approach to handling multiple simultaneous requests in an +environment that supports neither threads nor fork (or where these are +too expensive or inappropriate for the service) is to maintain an +explicit table of partially finished requests and to use a selector to +decide which request to work on next (or whether to handle a new +incoming request). This is particularly important for stream services +where each client can potentially be connected for a long time (if +threads or subprocesses cannot be used). + +Future work: +- Standard classes for Sun RPC (which uses either UDP or TCP) +- Standard mix-in classes to implement various authentication + and encryption schemes + +XXX Open problems: +- What to do with out-of-band data? + +BaseServer: +- split generic "request" functionality out into BaseServer class. + Copyright (C) 2000 Luke Kenneth Casson Leighton + + example: read entries from a SQL database (requires overriding + get_request() to return a table entry from the database). + entry is processed by a RequestHandlerClass. + +""" + +# Author of the BaseServer patch: Luke Kenneth Casson Leighton + +__version__ = "0.4" + + +import socket +import selectors +import os +import errno +import sys +try: + import threading +except ImportError: + import dummy_threading as threading +from io import BufferedIOBase +from time import monotonic as time + +__all__ = ["BaseServer", "TCPServer", "UDPServer", + "ThreadingUDPServer", "ThreadingTCPServer", + "BaseRequestHandler", "StreamRequestHandler", + "DatagramRequestHandler", "ThreadingMixIn"] +if hasattr(os, "fork"): + __all__.extend(["ForkingUDPServer","ForkingTCPServer", "ForkingMixIn"]) +if hasattr(socket, "AF_UNIX"): + __all__.extend(["UnixStreamServer","UnixDatagramServer", + "ThreadingUnixStreamServer", + "ThreadingUnixDatagramServer"]) + +# poll/select have the advantage of not requiring any extra file descriptor, +# contrarily to epoll/kqueue (also, they require a single syscall). +if hasattr(selectors, 'PollSelector'): + _ServerSelector = selectors.PollSelector +else: + _ServerSelector = selectors.SelectSelector + + +class BaseServer: + + """Base class for server classes. + + Methods for the caller: + + - __init__(server_address, RequestHandlerClass) + - serve_forever(poll_interval=0.5) + - shutdown() + - handle_request() # if you do not use serve_forever() + - fileno() -> int # for selector + + Methods that may be overridden: + + - server_bind() + - server_activate() + - get_request() -> request, client_address + - handle_timeout() + - verify_request(request, client_address) + - server_close() + - process_request(request, client_address) + - shutdown_request(request) + - close_request(request) + - service_actions() + - handle_error() + + Methods for derived classes: + + - finish_request(request, client_address) + + Class variables that may be overridden by derived classes or + instances: + + - timeout + - address_family + - socket_type + - allow_reuse_address + + Instance variables: + + - RequestHandlerClass + - socket + + """ + + timeout = None + + def __init__(self, server_address, RequestHandlerClass): + """Constructor. May be extended, do not override.""" + self.server_address = server_address + self.RequestHandlerClass = RequestHandlerClass + self.__is_shut_down = threading.Event() + self.__shutdown_request = False + + def server_activate(self): + """Called by constructor to activate the server. + + May be overridden. + + """ + pass + + def serve_forever(self, poll_interval=0.5): + """Handle one request at a time until shutdown. + + Polls for shutdown every poll_interval seconds. Ignores + self.timeout. If you need to do periodic tasks, do them in + another thread. + """ + self.__is_shut_down.clear() + try: + # XXX: Consider using another file descriptor or connecting to the + # socket to wake this up instead of polling. Polling reduces our + # responsiveness to a shutdown request and wastes cpu at all other + # times. + with _ServerSelector() as selector: + selector.register(self, selectors.EVENT_READ) + + while not self.__shutdown_request: + ready = selector.select(poll_interval) + if ready: + self._handle_request_noblock() + + self.service_actions() + finally: + self.__shutdown_request = False + self.__is_shut_down.set() + + def shutdown(self): + """Stops the serve_forever loop. + + Blocks until the loop has finished. This must be called while + serve_forever() is running in another thread, or it will + deadlock. + """ + self.__shutdown_request = True + self.__is_shut_down.wait() + + def service_actions(self): + """Called by the serve_forever() loop. + + May be overridden by a subclass / Mixin to implement any code that + needs to be run during the loop. + """ + pass + + # The distinction between handling, getting, processing and finishing a + # request is fairly arbitrary. Remember: + # + # - handle_request() is the top-level call. It calls selector.select(), + # get_request(), verify_request() and process_request() + # - get_request() is different for stream or datagram sockets + # - process_request() is the place that may fork a new process or create a + # new thread to finish the request + # - finish_request() instantiates the request handler class; this + # constructor will handle the request all by itself + + def handle_request(self): + """Handle one request, possibly blocking. + + Respects self.timeout. + """ + # Support people who used socket.settimeout() to escape + # handle_request before self.timeout was available. + timeout = self.socket.gettimeout() + if timeout is None: + timeout = self.timeout + elif self.timeout is not None: + timeout = min(timeout, self.timeout) + if timeout is not None: + deadline = time() + timeout + + # Wait until a request arrives or the timeout expires - the loop is + # necessary to accommodate early wakeups due to EINTR. + with _ServerSelector() as selector: + selector.register(self, selectors.EVENT_READ) + + while True: + ready = selector.select(timeout) + if ready: + return self._handle_request_noblock() + else: + if timeout is not None: + timeout = deadline - time() + if timeout < 0: + return self.handle_timeout() + + def _handle_request_noblock(self): + """Handle one request, without blocking. + + I assume that selector.select() has returned that the socket is + readable before this function was called, so there should be no risk of + blocking in get_request(). + """ + try: + request, client_address = self.get_request() + except OSError: + return + if self.verify_request(request, client_address): + try: + self.process_request(request, client_address) + except Exception: + self.handle_error(request, client_address) + self.shutdown_request(request) + except: + self.shutdown_request(request) + raise + else: + self.shutdown_request(request) + + def handle_timeout(self): + """Called if no new request arrives within self.timeout. + + Overridden by ForkingMixIn. + """ + pass + + def verify_request(self, request, client_address): + """Verify the request. May be overridden. + + Return True if we should proceed with this request. + + """ + return True + + def process_request(self, request, client_address): + """Call finish_request. + + Overridden by ForkingMixIn and ThreadingMixIn. + + """ + self.finish_request(request, client_address) + self.shutdown_request(request) + + def server_close(self): + """Called to clean-up the server. + + May be overridden. + + """ + pass + + def finish_request(self, request, client_address): + """Finish one request by instantiating RequestHandlerClass.""" + self.RequestHandlerClass(request, client_address, self) + + def shutdown_request(self, request): + """Called to shutdown and close an individual request.""" + self.close_request(request) + + def close_request(self, request): + """Called to clean up an individual request.""" + pass + + def handle_error(self, request, client_address): + """Handle an error gracefully. May be overridden. + + The default is to print a traceback and continue. + + """ + print('-'*40, file=sys.stderr) + print('Exception happened during processing of request from', + client_address, file=sys.stderr) + import traceback + traceback.print_exc() + print('-'*40, file=sys.stderr) + + def __enter__(self): + return self + + def __exit__(self, *args): + self.server_close() + + +class TCPServer(BaseServer): + + """Base class for various socket-based server classes. + + Defaults to synchronous IP stream (i.e., TCP). + + Methods for the caller: + + - __init__(server_address, RequestHandlerClass, bind_and_activate=True) + - serve_forever(poll_interval=0.5) + - shutdown() + - handle_request() # if you don't use serve_forever() + - fileno() -> int # for selector + + Methods that may be overridden: + + - server_bind() + - server_activate() + - get_request() -> request, client_address + - handle_timeout() + - verify_request(request, client_address) + - process_request(request, client_address) + - shutdown_request(request) + - close_request(request) + - handle_error() + + Methods for derived classes: + + - finish_request(request, client_address) + + Class variables that may be overridden by derived classes or + instances: + + - timeout + - address_family + - socket_type + - request_queue_size (only for stream sockets) + - allow_reuse_address + + Instance variables: + + - server_address + - RequestHandlerClass + - socket + + """ + + address_family = socket.AF_INET + + socket_type = socket.SOCK_STREAM + + request_queue_size = 5 + + allow_reuse_address = False + + def __init__(self, server_address, RequestHandlerClass, bind_and_activate=True): + """Constructor. May be extended, do not override.""" + BaseServer.__init__(self, server_address, RequestHandlerClass) + self.socket = socket.socket(self.address_family, + self.socket_type) + if bind_and_activate: + try: + self.server_bind() + self.server_activate() + except: + self.server_close() + raise + + def server_bind(self): + """Called by constructor to bind the socket. + + May be overridden. + + """ + if self.allow_reuse_address: + self.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + self.socket.bind(self.server_address) + self.server_address = self.socket.getsockname() + + def server_activate(self): + """Called by constructor to activate the server. + + May be overridden. + + """ + self.socket.listen(self.request_queue_size) + + def server_close(self): + """Called to clean-up the server. + + May be overridden. + + """ + self.socket.close() + + def fileno(self): + """Return socket file number. + + Interface required by selector. + + """ + return self.socket.fileno() + + def get_request(self): + """Get the request and client address from the socket. + + May be overridden. + + """ + return self.socket.accept() + + def shutdown_request(self, request): + """Called to shutdown and close an individual request.""" + try: + #explicitly shutdown. socket.close() merely releases + #the socket and waits for GC to perform the actual close. + request.shutdown(socket.SHUT_WR) + except OSError: + pass #some platforms may raise ENOTCONN here + self.close_request(request) + + def close_request(self, request): + """Called to clean up an individual request.""" + request.close() + + +class UDPServer(TCPServer): + + """UDP server class.""" + + allow_reuse_address = False + + socket_type = socket.SOCK_DGRAM + + max_packet_size = 8192 + + def get_request(self): + data, client_addr = self.socket.recvfrom(self.max_packet_size) + return (data, self.socket), client_addr + + def server_activate(self): + # No need to call listen() for UDP. + pass + + def shutdown_request(self, request): + # No need to shutdown anything. + self.close_request(request) + + def close_request(self, request): + # No need to close anything. + pass + +if hasattr(os, "fork"): + class ForkingMixIn: + """Mix-in class to handle each request in a new process.""" + + timeout = 300 + active_children = None + max_children = 40 + + def collect_children(self): + """Internal routine to wait for children that have exited.""" + if self.active_children is None: + return + + # If we're above the max number of children, wait and reap them until + # we go back below threshold. Note that we use waitpid(-1) below to be + # able to collect children in size() syscalls instead + # of size(): the downside is that this might reap children + # which we didn't spawn, which is why we only resort to this when we're + # above max_children. + while len(self.active_children) >= self.max_children: + try: + pid, _ = os.waitpid(-1, 0) + self.active_children.discard(pid) + except ChildProcessError: + # we don't have any children, we're done + self.active_children.clear() + except OSError: + break + + # Now reap all defunct children. + for pid in self.active_children.copy(): + try: + pid, _ = os.waitpid(pid, os.WNOHANG) + # if the child hasn't exited yet, pid will be 0 and ignored by + # discard() below + self.active_children.discard(pid) + except ChildProcessError: + # someone else reaped it + self.active_children.discard(pid) + except OSError: + pass + + def handle_timeout(self): + """Wait for zombies after self.timeout seconds of inactivity. + + May be extended, do not override. + """ + self.collect_children() + + def service_actions(self): + """Collect the zombie child processes regularly in the ForkingMixIn. + + service_actions is called in the BaseServer's serve_forver loop. + """ + self.collect_children() + + def process_request(self, request, client_address): + """Fork a new subprocess to process the request.""" + pid = os.fork() + if pid: + # Parent process + if self.active_children is None: + self.active_children = set() + self.active_children.add(pid) + self.close_request(request) + return + else: + # Child process. + # This must never return, hence os._exit()! + status = 1 + try: + self.finish_request(request, client_address) + status = 0 + except Exception: + self.handle_error(request, client_address) + finally: + try: + self.shutdown_request(request) + finally: + os._exit(status) + + +class ThreadingMixIn: + """Mix-in class to handle each request in a new thread.""" + + # Decides how threads will act upon termination of the + # main process + daemon_threads = False + + def process_request_thread(self, request, client_address): + """Same as in BaseServer but as a thread. + + In addition, exception handling is done here. + + """ + try: + self.finish_request(request, client_address) + except Exception: + self.handle_error(request, client_address) + finally: + self.shutdown_request(request) + + def process_request(self, request, client_address): + """Start a new thread to process the request.""" + t = threading.Thread(target = self.process_request_thread, + args = (request, client_address)) + t.daemon = self.daemon_threads + t.start() + + +if hasattr(os, "fork"): + class ForkingUDPServer(ForkingMixIn, UDPServer): pass + class ForkingTCPServer(ForkingMixIn, TCPServer): pass + +class ThreadingUDPServer(ThreadingMixIn, UDPServer): pass +class ThreadingTCPServer(ThreadingMixIn, TCPServer): pass + +if hasattr(socket, 'AF_UNIX'): + + class UnixStreamServer(TCPServer): + address_family = socket.AF_UNIX + + class UnixDatagramServer(UDPServer): + address_family = socket.AF_UNIX + + class ThreadingUnixStreamServer(ThreadingMixIn, UnixStreamServer): pass + + class ThreadingUnixDatagramServer(ThreadingMixIn, UnixDatagramServer): pass + +class BaseRequestHandler: + + """Base class for request handler classes. + + This class is instantiated for each request to be handled. The + constructor sets the instance variables request, client_address + and server, and then calls the handle() method. To implement a + specific service, all you need to do is to derive a class which + defines a handle() method. + + The handle() method can find the request as self.request, the + client address as self.client_address, and the server (in case it + needs access to per-server information) as self.server. Since a + separate instance is created for each request, the handle() method + can define other arbitrary instance variables. + + """ + + def __init__(self, request, client_address, server): + self.request = request + self.client_address = client_address + self.server = server + self.setup() + try: + self.handle() + finally: + self.finish() + + def setup(self): + pass + + def handle(self): + pass + + def finish(self): + pass + + +# The following two classes make it possible to use the same service +# class for stream or datagram servers. +# Each class sets up these instance variables: +# - rfile: a file object from which receives the request is read +# - wfile: a file object to which the reply is written +# When the handle() method returns, wfile is flushed properly + + +class StreamRequestHandler(BaseRequestHandler): + + """Define self.rfile and self.wfile for stream sockets.""" + + # Default buffer sizes for rfile, wfile. + # We default rfile to buffered because otherwise it could be + # really slow for large data (a getc() call per byte); we make + # wfile unbuffered because (a) often after a write() we want to + # read and we need to flush the line; (b) big writes to unbuffered + # files are typically optimized by stdio even when big reads + # aren't. + rbufsize = -1 + wbufsize = 0 + + # A timeout to apply to the request socket, if not None. + timeout = None + + # Disable nagle algorithm for this socket, if True. + # Use only when wbufsize != 0, to avoid small packets. + disable_nagle_algorithm = False + + def setup(self): + self.connection = self.request + if self.timeout is not None: + self.connection.settimeout(self.timeout) + if self.disable_nagle_algorithm: + self.connection.setsockopt(socket.IPPROTO_TCP, + socket.TCP_NODELAY, True) + self.rfile = self.connection.makefile('rb', self.rbufsize) + if self.wbufsize == 0: + self.wfile = _SocketWriter(self.connection) + else: + self.wfile = self.connection.makefile('wb', self.wbufsize) + + def finish(self): + if not self.wfile.closed: + try: + self.wfile.flush() + except socket.error: + # A final socket error may have occurred here, such as + # the local error ECONNABORTED. + pass + self.wfile.close() + self.rfile.close() + +class _SocketWriter(BufferedIOBase): + """Simple writable BufferedIOBase implementation for a socket + + Does not hold data in a buffer, avoiding any need to call flush().""" + + def __init__(self, sock): + self._sock = sock + + def writable(self): + return True + + def write(self, b): + self._sock.sendall(b) + with memoryview(b) as view: + return view.nbytes + + def fileno(self): + return self._sock.fileno() + +class DatagramRequestHandler(BaseRequestHandler): + + """Define self.rfile and self.wfile for datagram sockets.""" + + def setup(self): + from io import BytesIO + self.packet, self.socket = self.request + self.rfile = BytesIO(self.packet) + self.wfile = BytesIO() + + def finish(self): + self.socket.sendto(self.wfile.getvalue(), self.client_address) diff --git a/Lib/stringprep.py b/Lib/stringprep.py new file mode 100644 index 0000000000..44ecdb266c --- /dev/null +++ b/Lib/stringprep.py @@ -0,0 +1,272 @@ +# This file is generated by mkstringprep.py. DO NOT EDIT. +"""Library that exposes various tables found in the StringPrep RFC 3454. + +There are two kinds of tables: sets, for which a member test is provided, +and mappings, for which a mapping function is provided. +""" + +from unicodedata import ucd_3_2_0 as unicodedata + +assert unicodedata.unidata_version == '3.2.0' + +def in_table_a1(code): + if unicodedata.category(code) != 'Cn': return False + c = ord(code) + if 0xFDD0 <= c < 0xFDF0: return False + return (c & 0xFFFF) not in (0xFFFE, 0xFFFF) + + +b1_set = set([173, 847, 6150, 6155, 6156, 6157, 8203, 8204, 8205, 8288, 65279] + list(range(65024,65040))) +def in_table_b1(code): + return ord(code) in b1_set + + +b3_exceptions = { +0xb5:'\u03bc', 0xdf:'ss', 0x130:'i\u0307', 0x149:'\u02bcn', +0x17f:'s', 0x1f0:'j\u030c', 0x345:'\u03b9', 0x37a:' \u03b9', +0x390:'\u03b9\u0308\u0301', 0x3b0:'\u03c5\u0308\u0301', 0x3c2:'\u03c3', 0x3d0:'\u03b2', +0x3d1:'\u03b8', 0x3d2:'\u03c5', 0x3d3:'\u03cd', 0x3d4:'\u03cb', +0x3d5:'\u03c6', 0x3d6:'\u03c0', 0x3f0:'\u03ba', 0x3f1:'\u03c1', +0x3f2:'\u03c3', 0x3f5:'\u03b5', 0x587:'\u0565\u0582', 0x1e96:'h\u0331', +0x1e97:'t\u0308', 0x1e98:'w\u030a', 0x1e99:'y\u030a', 0x1e9a:'a\u02be', +0x1e9b:'\u1e61', 0x1f50:'\u03c5\u0313', 0x1f52:'\u03c5\u0313\u0300', 0x1f54:'\u03c5\u0313\u0301', +0x1f56:'\u03c5\u0313\u0342', 0x1f80:'\u1f00\u03b9', 0x1f81:'\u1f01\u03b9', 0x1f82:'\u1f02\u03b9', +0x1f83:'\u1f03\u03b9', 0x1f84:'\u1f04\u03b9', 0x1f85:'\u1f05\u03b9', 0x1f86:'\u1f06\u03b9', +0x1f87:'\u1f07\u03b9', 0x1f88:'\u1f00\u03b9', 0x1f89:'\u1f01\u03b9', 0x1f8a:'\u1f02\u03b9', +0x1f8b:'\u1f03\u03b9', 0x1f8c:'\u1f04\u03b9', 0x1f8d:'\u1f05\u03b9', 0x1f8e:'\u1f06\u03b9', +0x1f8f:'\u1f07\u03b9', 0x1f90:'\u1f20\u03b9', 0x1f91:'\u1f21\u03b9', 0x1f92:'\u1f22\u03b9', +0x1f93:'\u1f23\u03b9', 0x1f94:'\u1f24\u03b9', 0x1f95:'\u1f25\u03b9', 0x1f96:'\u1f26\u03b9', +0x1f97:'\u1f27\u03b9', 0x1f98:'\u1f20\u03b9', 0x1f99:'\u1f21\u03b9', 0x1f9a:'\u1f22\u03b9', +0x1f9b:'\u1f23\u03b9', 0x1f9c:'\u1f24\u03b9', 0x1f9d:'\u1f25\u03b9', 0x1f9e:'\u1f26\u03b9', +0x1f9f:'\u1f27\u03b9', 0x1fa0:'\u1f60\u03b9', 0x1fa1:'\u1f61\u03b9', 0x1fa2:'\u1f62\u03b9', +0x1fa3:'\u1f63\u03b9', 0x1fa4:'\u1f64\u03b9', 0x1fa5:'\u1f65\u03b9', 0x1fa6:'\u1f66\u03b9', +0x1fa7:'\u1f67\u03b9', 0x1fa8:'\u1f60\u03b9', 0x1fa9:'\u1f61\u03b9', 0x1faa:'\u1f62\u03b9', +0x1fab:'\u1f63\u03b9', 0x1fac:'\u1f64\u03b9', 0x1fad:'\u1f65\u03b9', 0x1fae:'\u1f66\u03b9', +0x1faf:'\u1f67\u03b9', 0x1fb2:'\u1f70\u03b9', 0x1fb3:'\u03b1\u03b9', 0x1fb4:'\u03ac\u03b9', +0x1fb6:'\u03b1\u0342', 0x1fb7:'\u03b1\u0342\u03b9', 0x1fbc:'\u03b1\u03b9', 0x1fbe:'\u03b9', +0x1fc2:'\u1f74\u03b9', 0x1fc3:'\u03b7\u03b9', 0x1fc4:'\u03ae\u03b9', 0x1fc6:'\u03b7\u0342', +0x1fc7:'\u03b7\u0342\u03b9', 0x1fcc:'\u03b7\u03b9', 0x1fd2:'\u03b9\u0308\u0300', 0x1fd3:'\u03b9\u0308\u0301', +0x1fd6:'\u03b9\u0342', 0x1fd7:'\u03b9\u0308\u0342', 0x1fe2:'\u03c5\u0308\u0300', 0x1fe3:'\u03c5\u0308\u0301', +0x1fe4:'\u03c1\u0313', 0x1fe6:'\u03c5\u0342', 0x1fe7:'\u03c5\u0308\u0342', 0x1ff2:'\u1f7c\u03b9', +0x1ff3:'\u03c9\u03b9', 0x1ff4:'\u03ce\u03b9', 0x1ff6:'\u03c9\u0342', 0x1ff7:'\u03c9\u0342\u03b9', +0x1ffc:'\u03c9\u03b9', 0x20a8:'rs', 0x2102:'c', 0x2103:'\xb0c', +0x2107:'\u025b', 0x2109:'\xb0f', 0x210b:'h', 0x210c:'h', +0x210d:'h', 0x2110:'i', 0x2111:'i', 0x2112:'l', +0x2115:'n', 0x2116:'no', 0x2119:'p', 0x211a:'q', +0x211b:'r', 0x211c:'r', 0x211d:'r', 0x2120:'sm', +0x2121:'tel', 0x2122:'tm', 0x2124:'z', 0x2128:'z', +0x212c:'b', 0x212d:'c', 0x2130:'e', 0x2131:'f', +0x2133:'m', 0x213e:'\u03b3', 0x213f:'\u03c0', 0x2145:'d', +0x3371:'hpa', 0x3373:'au', 0x3375:'ov', 0x3380:'pa', +0x3381:'na', 0x3382:'\u03bca', 0x3383:'ma', 0x3384:'ka', +0x3385:'kb', 0x3386:'mb', 0x3387:'gb', 0x338a:'pf', +0x338b:'nf', 0x338c:'\u03bcf', 0x3390:'hz', 0x3391:'khz', +0x3392:'mhz', 0x3393:'ghz', 0x3394:'thz', 0x33a9:'pa', +0x33aa:'kpa', 0x33ab:'mpa', 0x33ac:'gpa', 0x33b4:'pv', +0x33b5:'nv', 0x33b6:'\u03bcv', 0x33b7:'mv', 0x33b8:'kv', +0x33b9:'mv', 0x33ba:'pw', 0x33bb:'nw', 0x33bc:'\u03bcw', +0x33bd:'mw', 0x33be:'kw', 0x33bf:'mw', 0x33c0:'k\u03c9', +0x33c1:'m\u03c9', 0x33c3:'bq', 0x33c6:'c\u2215kg', 0x33c7:'co.', +0x33c8:'db', 0x33c9:'gy', 0x33cb:'hp', 0x33cd:'kk', +0x33ce:'km', 0x33d7:'ph', 0x33d9:'ppm', 0x33da:'pr', +0x33dc:'sv', 0x33dd:'wb', 0xfb00:'ff', 0xfb01:'fi', +0xfb02:'fl', 0xfb03:'ffi', 0xfb04:'ffl', 0xfb05:'st', +0xfb06:'st', 0xfb13:'\u0574\u0576', 0xfb14:'\u0574\u0565', 0xfb15:'\u0574\u056b', +0xfb16:'\u057e\u0576', 0xfb17:'\u0574\u056d', 0x1d400:'a', 0x1d401:'b', +0x1d402:'c', 0x1d403:'d', 0x1d404:'e', 0x1d405:'f', +0x1d406:'g', 0x1d407:'h', 0x1d408:'i', 0x1d409:'j', +0x1d40a:'k', 0x1d40b:'l', 0x1d40c:'m', 0x1d40d:'n', +0x1d40e:'o', 0x1d40f:'p', 0x1d410:'q', 0x1d411:'r', +0x1d412:'s', 0x1d413:'t', 0x1d414:'u', 0x1d415:'v', +0x1d416:'w', 0x1d417:'x', 0x1d418:'y', 0x1d419:'z', +0x1d434:'a', 0x1d435:'b', 0x1d436:'c', 0x1d437:'d', +0x1d438:'e', 0x1d439:'f', 0x1d43a:'g', 0x1d43b:'h', +0x1d43c:'i', 0x1d43d:'j', 0x1d43e:'k', 0x1d43f:'l', +0x1d440:'m', 0x1d441:'n', 0x1d442:'o', 0x1d443:'p', +0x1d444:'q', 0x1d445:'r', 0x1d446:'s', 0x1d447:'t', +0x1d448:'u', 0x1d449:'v', 0x1d44a:'w', 0x1d44b:'x', +0x1d44c:'y', 0x1d44d:'z', 0x1d468:'a', 0x1d469:'b', +0x1d46a:'c', 0x1d46b:'d', 0x1d46c:'e', 0x1d46d:'f', +0x1d46e:'g', 0x1d46f:'h', 0x1d470:'i', 0x1d471:'j', +0x1d472:'k', 0x1d473:'l', 0x1d474:'m', 0x1d475:'n', +0x1d476:'o', 0x1d477:'p', 0x1d478:'q', 0x1d479:'r', +0x1d47a:'s', 0x1d47b:'t', 0x1d47c:'u', 0x1d47d:'v', +0x1d47e:'w', 0x1d47f:'x', 0x1d480:'y', 0x1d481:'z', +0x1d49c:'a', 0x1d49e:'c', 0x1d49f:'d', 0x1d4a2:'g', +0x1d4a5:'j', 0x1d4a6:'k', 0x1d4a9:'n', 0x1d4aa:'o', +0x1d4ab:'p', 0x1d4ac:'q', 0x1d4ae:'s', 0x1d4af:'t', +0x1d4b0:'u', 0x1d4b1:'v', 0x1d4b2:'w', 0x1d4b3:'x', +0x1d4b4:'y', 0x1d4b5:'z', 0x1d4d0:'a', 0x1d4d1:'b', +0x1d4d2:'c', 0x1d4d3:'d', 0x1d4d4:'e', 0x1d4d5:'f', +0x1d4d6:'g', 0x1d4d7:'h', 0x1d4d8:'i', 0x1d4d9:'j', +0x1d4da:'k', 0x1d4db:'l', 0x1d4dc:'m', 0x1d4dd:'n', +0x1d4de:'o', 0x1d4df:'p', 0x1d4e0:'q', 0x1d4e1:'r', +0x1d4e2:'s', 0x1d4e3:'t', 0x1d4e4:'u', 0x1d4e5:'v', +0x1d4e6:'w', 0x1d4e7:'x', 0x1d4e8:'y', 0x1d4e9:'z', +0x1d504:'a', 0x1d505:'b', 0x1d507:'d', 0x1d508:'e', +0x1d509:'f', 0x1d50a:'g', 0x1d50d:'j', 0x1d50e:'k', +0x1d50f:'l', 0x1d510:'m', 0x1d511:'n', 0x1d512:'o', +0x1d513:'p', 0x1d514:'q', 0x1d516:'s', 0x1d517:'t', +0x1d518:'u', 0x1d519:'v', 0x1d51a:'w', 0x1d51b:'x', +0x1d51c:'y', 0x1d538:'a', 0x1d539:'b', 0x1d53b:'d', +0x1d53c:'e', 0x1d53d:'f', 0x1d53e:'g', 0x1d540:'i', +0x1d541:'j', 0x1d542:'k', 0x1d543:'l', 0x1d544:'m', +0x1d546:'o', 0x1d54a:'s', 0x1d54b:'t', 0x1d54c:'u', +0x1d54d:'v', 0x1d54e:'w', 0x1d54f:'x', 0x1d550:'y', +0x1d56c:'a', 0x1d56d:'b', 0x1d56e:'c', 0x1d56f:'d', +0x1d570:'e', 0x1d571:'f', 0x1d572:'g', 0x1d573:'h', +0x1d574:'i', 0x1d575:'j', 0x1d576:'k', 0x1d577:'l', +0x1d578:'m', 0x1d579:'n', 0x1d57a:'o', 0x1d57b:'p', +0x1d57c:'q', 0x1d57d:'r', 0x1d57e:'s', 0x1d57f:'t', +0x1d580:'u', 0x1d581:'v', 0x1d582:'w', 0x1d583:'x', +0x1d584:'y', 0x1d585:'z', 0x1d5a0:'a', 0x1d5a1:'b', +0x1d5a2:'c', 0x1d5a3:'d', 0x1d5a4:'e', 0x1d5a5:'f', +0x1d5a6:'g', 0x1d5a7:'h', 0x1d5a8:'i', 0x1d5a9:'j', +0x1d5aa:'k', 0x1d5ab:'l', 0x1d5ac:'m', 0x1d5ad:'n', +0x1d5ae:'o', 0x1d5af:'p', 0x1d5b0:'q', 0x1d5b1:'r', +0x1d5b2:'s', 0x1d5b3:'t', 0x1d5b4:'u', 0x1d5b5:'v', +0x1d5b6:'w', 0x1d5b7:'x', 0x1d5b8:'y', 0x1d5b9:'z', +0x1d5d4:'a', 0x1d5d5:'b', 0x1d5d6:'c', 0x1d5d7:'d', +0x1d5d8:'e', 0x1d5d9:'f', 0x1d5da:'g', 0x1d5db:'h', +0x1d5dc:'i', 0x1d5dd:'j', 0x1d5de:'k', 0x1d5df:'l', +0x1d5e0:'m', 0x1d5e1:'n', 0x1d5e2:'o', 0x1d5e3:'p', +0x1d5e4:'q', 0x1d5e5:'r', 0x1d5e6:'s', 0x1d5e7:'t', +0x1d5e8:'u', 0x1d5e9:'v', 0x1d5ea:'w', 0x1d5eb:'x', +0x1d5ec:'y', 0x1d5ed:'z', 0x1d608:'a', 0x1d609:'b', +0x1d60a:'c', 0x1d60b:'d', 0x1d60c:'e', 0x1d60d:'f', +0x1d60e:'g', 0x1d60f:'h', 0x1d610:'i', 0x1d611:'j', +0x1d612:'k', 0x1d613:'l', 0x1d614:'m', 0x1d615:'n', +0x1d616:'o', 0x1d617:'p', 0x1d618:'q', 0x1d619:'r', +0x1d61a:'s', 0x1d61b:'t', 0x1d61c:'u', 0x1d61d:'v', +0x1d61e:'w', 0x1d61f:'x', 0x1d620:'y', 0x1d621:'z', +0x1d63c:'a', 0x1d63d:'b', 0x1d63e:'c', 0x1d63f:'d', +0x1d640:'e', 0x1d641:'f', 0x1d642:'g', 0x1d643:'h', +0x1d644:'i', 0x1d645:'j', 0x1d646:'k', 0x1d647:'l', +0x1d648:'m', 0x1d649:'n', 0x1d64a:'o', 0x1d64b:'p', +0x1d64c:'q', 0x1d64d:'r', 0x1d64e:'s', 0x1d64f:'t', +0x1d650:'u', 0x1d651:'v', 0x1d652:'w', 0x1d653:'x', +0x1d654:'y', 0x1d655:'z', 0x1d670:'a', 0x1d671:'b', +0x1d672:'c', 0x1d673:'d', 0x1d674:'e', 0x1d675:'f', +0x1d676:'g', 0x1d677:'h', 0x1d678:'i', 0x1d679:'j', +0x1d67a:'k', 0x1d67b:'l', 0x1d67c:'m', 0x1d67d:'n', +0x1d67e:'o', 0x1d67f:'p', 0x1d680:'q', 0x1d681:'r', +0x1d682:'s', 0x1d683:'t', 0x1d684:'u', 0x1d685:'v', +0x1d686:'w', 0x1d687:'x', 0x1d688:'y', 0x1d689:'z', +0x1d6a8:'\u03b1', 0x1d6a9:'\u03b2', 0x1d6aa:'\u03b3', 0x1d6ab:'\u03b4', +0x1d6ac:'\u03b5', 0x1d6ad:'\u03b6', 0x1d6ae:'\u03b7', 0x1d6af:'\u03b8', +0x1d6b0:'\u03b9', 0x1d6b1:'\u03ba', 0x1d6b2:'\u03bb', 0x1d6b3:'\u03bc', +0x1d6b4:'\u03bd', 0x1d6b5:'\u03be', 0x1d6b6:'\u03bf', 0x1d6b7:'\u03c0', +0x1d6b8:'\u03c1', 0x1d6b9:'\u03b8', 0x1d6ba:'\u03c3', 0x1d6bb:'\u03c4', +0x1d6bc:'\u03c5', 0x1d6bd:'\u03c6', 0x1d6be:'\u03c7', 0x1d6bf:'\u03c8', +0x1d6c0:'\u03c9', 0x1d6d3:'\u03c3', 0x1d6e2:'\u03b1', 0x1d6e3:'\u03b2', +0x1d6e4:'\u03b3', 0x1d6e5:'\u03b4', 0x1d6e6:'\u03b5', 0x1d6e7:'\u03b6', +0x1d6e8:'\u03b7', 0x1d6e9:'\u03b8', 0x1d6ea:'\u03b9', 0x1d6eb:'\u03ba', +0x1d6ec:'\u03bb', 0x1d6ed:'\u03bc', 0x1d6ee:'\u03bd', 0x1d6ef:'\u03be', +0x1d6f0:'\u03bf', 0x1d6f1:'\u03c0', 0x1d6f2:'\u03c1', 0x1d6f3:'\u03b8', +0x1d6f4:'\u03c3', 0x1d6f5:'\u03c4', 0x1d6f6:'\u03c5', 0x1d6f7:'\u03c6', +0x1d6f8:'\u03c7', 0x1d6f9:'\u03c8', 0x1d6fa:'\u03c9', 0x1d70d:'\u03c3', +0x1d71c:'\u03b1', 0x1d71d:'\u03b2', 0x1d71e:'\u03b3', 0x1d71f:'\u03b4', +0x1d720:'\u03b5', 0x1d721:'\u03b6', 0x1d722:'\u03b7', 0x1d723:'\u03b8', +0x1d724:'\u03b9', 0x1d725:'\u03ba', 0x1d726:'\u03bb', 0x1d727:'\u03bc', +0x1d728:'\u03bd', 0x1d729:'\u03be', 0x1d72a:'\u03bf', 0x1d72b:'\u03c0', +0x1d72c:'\u03c1', 0x1d72d:'\u03b8', 0x1d72e:'\u03c3', 0x1d72f:'\u03c4', +0x1d730:'\u03c5', 0x1d731:'\u03c6', 0x1d732:'\u03c7', 0x1d733:'\u03c8', +0x1d734:'\u03c9', 0x1d747:'\u03c3', 0x1d756:'\u03b1', 0x1d757:'\u03b2', +0x1d758:'\u03b3', 0x1d759:'\u03b4', 0x1d75a:'\u03b5', 0x1d75b:'\u03b6', +0x1d75c:'\u03b7', 0x1d75d:'\u03b8', 0x1d75e:'\u03b9', 0x1d75f:'\u03ba', +0x1d760:'\u03bb', 0x1d761:'\u03bc', 0x1d762:'\u03bd', 0x1d763:'\u03be', +0x1d764:'\u03bf', 0x1d765:'\u03c0', 0x1d766:'\u03c1', 0x1d767:'\u03b8', +0x1d768:'\u03c3', 0x1d769:'\u03c4', 0x1d76a:'\u03c5', 0x1d76b:'\u03c6', +0x1d76c:'\u03c7', 0x1d76d:'\u03c8', 0x1d76e:'\u03c9', 0x1d781:'\u03c3', +0x1d790:'\u03b1', 0x1d791:'\u03b2', 0x1d792:'\u03b3', 0x1d793:'\u03b4', +0x1d794:'\u03b5', 0x1d795:'\u03b6', 0x1d796:'\u03b7', 0x1d797:'\u03b8', +0x1d798:'\u03b9', 0x1d799:'\u03ba', 0x1d79a:'\u03bb', 0x1d79b:'\u03bc', +0x1d79c:'\u03bd', 0x1d79d:'\u03be', 0x1d79e:'\u03bf', 0x1d79f:'\u03c0', +0x1d7a0:'\u03c1', 0x1d7a1:'\u03b8', 0x1d7a2:'\u03c3', 0x1d7a3:'\u03c4', +0x1d7a4:'\u03c5', 0x1d7a5:'\u03c6', 0x1d7a6:'\u03c7', 0x1d7a7:'\u03c8', +0x1d7a8:'\u03c9', 0x1d7bb:'\u03c3', } + +def map_table_b3(code): + r = b3_exceptions.get(ord(code)) + if r is not None: return r + return code.lower() + + +def map_table_b2(a): + al = map_table_b3(a) + b = unicodedata.normalize("NFKC", al) + bl = "".join([map_table_b3(ch) for ch in b]) + c = unicodedata.normalize("NFKC", bl) + if b != c: + return c + else: + return al + + +def in_table_c11(code): + return code == " " + + +def in_table_c12(code): + return unicodedata.category(code) == "Zs" and code != " " + +def in_table_c11_c12(code): + return unicodedata.category(code) == "Zs" + + +def in_table_c21(code): + return ord(code) < 128 and unicodedata.category(code) == "Cc" + +c22_specials = set([1757, 1807, 6158, 8204, 8205, 8232, 8233, 65279] + list(range(8288,8292)) + list(range(8298,8304)) + list(range(65529,65533)) + list(range(119155,119163))) +def in_table_c22(code): + c = ord(code) + if c < 128: return False + if unicodedata.category(code) == "Cc": return True + return c in c22_specials + +def in_table_c21_c22(code): + return unicodedata.category(code) == "Cc" or \ + ord(code) in c22_specials + + +def in_table_c3(code): + return unicodedata.category(code) == "Co" + + +def in_table_c4(code): + c = ord(code) + if c < 0xFDD0: return False + if c < 0xFDF0: return True + return (ord(code) & 0xFFFF) in (0xFFFE, 0xFFFF) + + +def in_table_c5(code): + return unicodedata.category(code) == "Cs" + + +c6_set = set(range(65529,65534)) +def in_table_c6(code): + return ord(code) in c6_set + + +c7_set = set(range(12272,12284)) +def in_table_c7(code): + return ord(code) in c7_set + + +c8_set = set([832, 833, 8206, 8207] + list(range(8234,8239)) + list(range(8298,8304))) +def in_table_c8(code): + return ord(code) in c8_set + + +c9_set = set([917505] + list(range(917536,917632))) +def in_table_c9(code): + return ord(code) in c9_set + + +def in_table_d1(code): + return unicodedata.bidirectional(code) in ("R","AL") + + +def in_table_d2(code): + return unicodedata.bidirectional(code) == "L" diff --git a/Lib/urllib/__init__.py b/Lib/urllib/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/Lib/urllib/error.py b/Lib/urllib/error.py new file mode 100644 index 0000000000..c5b675d161 --- /dev/null +++ b/Lib/urllib/error.py @@ -0,0 +1,81 @@ +"""Exception classes raised by urllib. + +The base exception class is URLError, which inherits from OSError. It +doesn't define any behavior of its own, but is the base class for all +exceptions defined in this package. + +HTTPError is an exception class that is also a valid HTTP response +instance. It behaves this way because HTTP protocol errors are valid +responses, with a status code, headers, and a body. In some contexts, +an application may want to handle an exception like a regular +response. +""" + +import urllib.response + +__all__ = ['URLError', 'HTTPError', 'ContentTooShortError'] + + +# do these error classes make sense? +# make sure all of the OSError stuff is overridden. we just want to be +# subtypes. + +class URLError(OSError): + # URLError is a sub-type of OSError, but it doesn't share any of + # the implementation. need to override __init__ and __str__. + # It sets self.args for compatibility with other EnvironmentError + # subclasses, but args doesn't have the typical format with errno in + # slot 0 and strerror in slot 1. This may be better than nothing. + def __init__(self, reason, filename=None): + self.args = reason, + self.reason = reason + if filename is not None: + self.filename = filename + + def __str__(self): + return '' % self.reason + + +class HTTPError(URLError, urllib.response.addinfourl): + """Raised when HTTP error occurs, but also acts like non-error return""" + __super_init = urllib.response.addinfourl.__init__ + + def __init__(self, url, code, msg, hdrs, fp): + self.code = code + self.msg = msg + self.hdrs = hdrs + self.fp = fp + self.filename = url + # The addinfourl classes depend on fp being a valid file + # object. In some cases, the HTTPError may not have a valid + # file object. If this happens, the simplest workaround is to + # not initialize the base classes. + if fp is not None: + self.__super_init(fp, hdrs, url, code) + + def __str__(self): + return 'HTTP Error %s: %s' % (self.code, self.msg) + + def __repr__(self): + return '' % (self.code, self.msg) + + # since URLError specifies a .reason attribute, HTTPError should also + # provide this attribute. See issue13211 for discussion. + @property + def reason(self): + return self.msg + + @property + def headers(self): + return self.hdrs + + @headers.setter + def headers(self, headers): + self.hdrs = headers + + +class ContentTooShortError(URLError): + """Exception raised when downloaded size does not match content-length.""" + def __init__(self, message, content): + URLError.__init__(self, message) + self.content = content diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py new file mode 100644 index 0000000000..958767a08d --- /dev/null +++ b/Lib/urllib/parse.py @@ -0,0 +1,1009 @@ +"""Parse (absolute and relative) URLs. + +urlparse module is based upon the following RFC specifications. + +RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding +and L. Masinter, January 2005. + +RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter +and L.Masinter, December 1999. + +RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T. +Berners-Lee, R. Fielding, and L. Masinter, August 1998. + +RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998. + +RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June +1995. + +RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M. +McCahill, December 1994 + +RFC 3986 is considered the current standard and any future changes to +urlparse module should conform with it. The urlparse module is +currently not entirely compliant with this RFC due to defacto +scenarios for parsing, and for backward compatibility purposes, some +parsing quirks from older RFCs are retained. The testcases in +test_urlparse.py provides a good indicator of parsing behavior. +""" + +import re +import sys +import collections + +__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", + "urlsplit", "urlunsplit", "urlencode", "parse_qs", + "parse_qsl", "quote", "quote_plus", "quote_from_bytes", + "unquote", "unquote_plus", "unquote_to_bytes", + "DefragResult", "ParseResult", "SplitResult", + "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"] + +# A classification of schemes ('' means apply by default) +uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap', + 'wais', 'file', 'https', 'shttp', 'mms', + 'prospero', 'rtsp', 'rtspu', '', 'sftp', + 'svn', 'svn+ssh', 'ws', 'wss'] +uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', + 'imap', 'wais', 'file', 'mms', 'https', 'shttp', + 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '', + 'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh', + 'ws', 'wss'] +uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap', + 'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips', + 'mms', '', 'sftp', 'tel'] + +# These are not actually used anymore, but should stay for backwards +# compatibility. (They are undocumented, but have a public-looking name.) +non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', + 'telnet', 'wais', 'imap', 'snews', 'sip', 'sips'] +uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms', + 'gopher', 'rtsp', 'rtspu', 'sip', 'sips', ''] +uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', + 'nntp', 'wais', 'https', 'shttp', 'snews', + 'file', 'prospero', ''] + +# Characters valid in scheme names +scheme_chars = ('abcdefghijklmnopqrstuvwxyz' + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + '0123456789' + '+-.') + +# XXX: Consider replacing with functools.lru_cache +MAX_CACHE_SIZE = 20 +_parse_cache = {} + +def clear_cache(): + """Clear the parse cache and the quoters cache.""" + _parse_cache.clear() + _safe_quoters.clear() + + +# Helpers for bytes handling +# For 3.2, we deliberately require applications that +# handle improperly quoted URLs to do their own +# decoding and encoding. If valid use cases are +# presented, we may relax this by using latin-1 +# decoding internally for 3.3 +_implicit_encoding = 'ascii' +_implicit_errors = 'strict' + +def _noop(obj): + return obj + +def _encode_result(obj, encoding=_implicit_encoding, + errors=_implicit_errors): + return obj.encode(encoding, errors) + +def _decode_args(args, encoding=_implicit_encoding, + errors=_implicit_errors): + return tuple(x.decode(encoding, errors) if x else '' for x in args) + +def _coerce_args(*args): + # Invokes decode if necessary to create str args + # and returns the coerced inputs along with + # an appropriate result coercion function + # - noop for str inputs + # - encoding function otherwise + str_input = isinstance(args[0], str) + for arg in args[1:]: + # We special-case the empty string to support the + # "scheme=''" default argument to some functions + if arg and isinstance(arg, str) != str_input: + raise TypeError("Cannot mix str and non-str arguments") + if str_input: + return args + (_noop,) + return _decode_args(args) + (_encode_result,) + +# Result objects are more helpful than simple tuples +class _ResultMixinStr(object): + """Standard approach to encoding parsed results from str to bytes""" + __slots__ = () + + def encode(self, encoding='ascii', errors='strict'): + return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self)) + + +class _ResultMixinBytes(object): + """Standard approach to decoding parsed results from bytes to str""" + __slots__ = () + + def decode(self, encoding='ascii', errors='strict'): + return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self)) + + +class _NetlocResultMixinBase(object): + """Shared methods for the parsed result objects containing a netloc element""" + __slots__ = () + + @property + def username(self): + return self._userinfo[0] + + @property + def password(self): + return self._userinfo[1] + + @property + def hostname(self): + hostname = self._hostinfo[0] + if not hostname: + hostname = None + elif hostname is not None: + hostname = hostname.lower() + return hostname + + @property + def port(self): + port = self._hostinfo[1] + if port is not None: + port = int(port, 10) + if not ( 0 <= port <= 65535): + raise ValueError("Port out of range 0-65535") + return port + + +class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr): + __slots__ = () + + @property + def _userinfo(self): + netloc = self.netloc + userinfo, have_info, hostinfo = netloc.rpartition('@') + if have_info: + username, have_password, password = userinfo.partition(':') + if not have_password: + password = None + else: + username = password = None + return username, password + + @property + def _hostinfo(self): + netloc = self.netloc + _, _, hostinfo = netloc.rpartition('@') + _, have_open_br, bracketed = hostinfo.partition('[') + if have_open_br: + hostname, _, port = bracketed.partition(']') + _, _, port = port.partition(':') + else: + hostname, _, port = hostinfo.partition(':') + if not port: + port = None + return hostname, port + + +class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes): + __slots__ = () + + @property + def _userinfo(self): + netloc = self.netloc + userinfo, have_info, hostinfo = netloc.rpartition(b'@') + if have_info: + username, have_password, password = userinfo.partition(b':') + if not have_password: + password = None + else: + username = password = None + return username, password + + @property + def _hostinfo(self): + netloc = self.netloc + _, _, hostinfo = netloc.rpartition(b'@') + _, have_open_br, bracketed = hostinfo.partition(b'[') + if have_open_br: + hostname, _, port = bracketed.partition(b']') + _, _, port = port.partition(b':') + else: + hostname, _, port = hostinfo.partition(b':') + if not port: + port = None + return hostname, port + + +from collections import namedtuple + +_DefragResultBase = namedtuple('DefragResult', 'url fragment') +_SplitResultBase = namedtuple( + 'SplitResult', 'scheme netloc path query fragment') +_ParseResultBase = namedtuple( + 'ParseResult', 'scheme netloc path params query fragment') + +_DefragResultBase.__doc__ = """ +DefragResult(url, fragment) + +A 2-tuple that contains the url without fragment identifier and the fragment +identifier as a separate argument. +""" + +_DefragResultBase.url.__doc__ = """The URL with no fragment identifier.""" + +_DefragResultBase.fragment.__doc__ = """ +Fragment identifier separated from URL, that allows indirect identification of a +secondary resource by reference to a primary resource and additional identifying +information. +""" + +_SplitResultBase.__doc__ = """ +SplitResult(scheme, netloc, path, query, fragment) + +A 5-tuple that contains the different components of a URL. Similar to +ParseResult, but does not split params. +""" + +_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request.""" + +_SplitResultBase.netloc.__doc__ = """ +Network location where the request is made to. +""" + +_SplitResultBase.path.__doc__ = """ +The hierarchical path, such as the path to a file to download. +""" + +_SplitResultBase.query.__doc__ = """ +The query component, that contains non-hierarchical data, that along with data +in path component, identifies a resource in the scope of URI's scheme and +network location. +""" + +_SplitResultBase.fragment.__doc__ = """ +Fragment identifier, that allows indirect identification of a secondary resource +by reference to a primary resource and additional identifying information. +""" + +_ParseResultBase.__doc__ = """ +ParseResult(scheme, netloc, path, params, query, fragment) + +A 6-tuple that contains components of a parsed URL. +""" + +_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__ +_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__ +_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__ +_ParseResultBase.params.__doc__ = """ +Parameters for last path element used to dereference the URI in order to provide +access to perform some operation on the resource. +""" + +_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__ +_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__ + + +# For backwards compatibility, alias _NetlocResultMixinStr +# ResultBase is no longer part of the documented API, but it is +# retained since deprecating it isn't worth the hassle +ResultBase = _NetlocResultMixinStr + +# Structured result objects for string data +class DefragResult(_DefragResultBase, _ResultMixinStr): + __slots__ = () + def geturl(self): + if self.fragment: + return self.url + '#' + self.fragment + else: + return self.url + +class SplitResult(_SplitResultBase, _NetlocResultMixinStr): + __slots__ = () + def geturl(self): + return urlunsplit(self) + +class ParseResult(_ParseResultBase, _NetlocResultMixinStr): + __slots__ = () + def geturl(self): + return urlunparse(self) + +# Structured result objects for bytes data +class DefragResultBytes(_DefragResultBase, _ResultMixinBytes): + __slots__ = () + def geturl(self): + if self.fragment: + return self.url + b'#' + self.fragment + else: + return self.url + +class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes): + __slots__ = () + def geturl(self): + return urlunsplit(self) + +class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes): + __slots__ = () + def geturl(self): + return urlunparse(self) + +# Set up the encode/decode result pairs +def _fix_result_transcoding(): + _result_pairs = ( + (DefragResult, DefragResultBytes), + (SplitResult, SplitResultBytes), + (ParseResult, ParseResultBytes), + ) + for _decoded, _encoded in _result_pairs: + _decoded._encoded_counterpart = _encoded + _encoded._decoded_counterpart = _decoded + +_fix_result_transcoding() +del _fix_result_transcoding + +def urlparse(url, scheme='', allow_fragments=True): + """Parse a URL into 6 components: + :///;?# + Return a 6-tuple: (scheme, netloc, path, params, query, fragment). + Note that we don't break the components up in smaller bits + (e.g. netloc is a single string) and we don't expand % escapes.""" + url, scheme, _coerce_result = _coerce_args(url, scheme) + splitresult = urlsplit(url, scheme, allow_fragments) + scheme, netloc, url, query, fragment = splitresult + if scheme in uses_params and ';' in url: + url, params = _splitparams(url) + else: + params = '' + result = ParseResult(scheme, netloc, url, params, query, fragment) + return _coerce_result(result) + +def _splitparams(url): + if '/' in url: + i = url.find(';', url.rfind('/')) + if i < 0: + return url, '' + else: + i = url.find(';') + return url[:i], url[i+1:] + +def _splitnetloc(url, start=0): + delim = len(url) # position of end of domain part of url, default is end + for c in '/?#': # look for delimiters; the order is NOT important + wdelim = url.find(c, start) # find first of this delim + if wdelim >= 0: # if found + delim = min(delim, wdelim) # use earliest delim position + return url[start:delim], url[delim:] # return (domain, rest) + +def urlsplit(url, scheme='', allow_fragments=True): + """Parse a URL into 5 components: + :///?# + Return a 5-tuple: (scheme, netloc, path, query, fragment). + Note that we don't break the components up in smaller bits + (e.g. netloc is a single string) and we don't expand % escapes.""" + url, scheme, _coerce_result = _coerce_args(url, scheme) + allow_fragments = bool(allow_fragments) + key = url, scheme, allow_fragments, type(url), type(scheme) + cached = _parse_cache.get(key, None) + if cached: + return _coerce_result(cached) + if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth + clear_cache() + netloc = query = fragment = '' + i = url.find(':') + if i > 0: + if url[:i] == 'http': # optimize the common case + scheme = url[:i].lower() + url = url[i+1:] + if url[:2] == '//': + netloc, url = _splitnetloc(url, 2) + if (('[' in netloc and ']' not in netloc) or + (']' in netloc and '[' not in netloc)): + raise ValueError("Invalid IPv6 URL") + if allow_fragments and '#' in url: + url, fragment = url.split('#', 1) + if '?' in url: + url, query = url.split('?', 1) + v = SplitResult(scheme, netloc, url, query, fragment) + _parse_cache[key] = v + return _coerce_result(v) + for c in url[:i]: + if c not in scheme_chars: + break + else: + # make sure "url" is not actually a port number (in which case + # "scheme" is really part of the path) + rest = url[i+1:] + if not rest or any(c not in '0123456789' for c in rest): + # not a port number + scheme, url = url[:i].lower(), rest + + if url[:2] == '//': + netloc, url = _splitnetloc(url, 2) + if (('[' in netloc and ']' not in netloc) or + (']' in netloc and '[' not in netloc)): + raise ValueError("Invalid IPv6 URL") + if allow_fragments and '#' in url: + url, fragment = url.split('#', 1) + if '?' in url: + url, query = url.split('?', 1) + v = SplitResult(scheme, netloc, url, query, fragment) + _parse_cache[key] = v + return _coerce_result(v) + +def urlunparse(components): + """Put a parsed URL back together again. This may result in a + slightly different, but equivalent URL, if the URL that was parsed + originally had redundant delimiters, e.g. a ? with an empty query + (the draft states that these are equivalent).""" + scheme, netloc, url, params, query, fragment, _coerce_result = ( + _coerce_args(*components)) + if params: + url = "%s;%s" % (url, params) + return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment))) + +def urlunsplit(components): + """Combine the elements of a tuple as returned by urlsplit() into a + complete URL as a string. The data argument can be any five-item iterable. + This may result in a slightly different, but equivalent URL, if the URL that + was parsed originally had unnecessary delimiters (for example, a ? with an + empty query; the RFC states that these are equivalent).""" + scheme, netloc, url, query, fragment, _coerce_result = ( + _coerce_args(*components)) + if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'): + if url and url[:1] != '/': url = '/' + url + url = '//' + (netloc or '') + url + if scheme: + url = scheme + ':' + url + if query: + url = url + '?' + query + if fragment: + url = url + '#' + fragment + return _coerce_result(url) + +def urljoin(base, url, allow_fragments=True): + """Join a base URL and a possibly relative URL to form an absolute + interpretation of the latter.""" + if not base: + return url + if not url: + return base + + base, url, _coerce_result = _coerce_args(base, url) + bscheme, bnetloc, bpath, bparams, bquery, bfragment = \ + urlparse(base, '', allow_fragments) + scheme, netloc, path, params, query, fragment = \ + urlparse(url, bscheme, allow_fragments) + + if scheme != bscheme or scheme not in uses_relative: + return _coerce_result(url) + if scheme in uses_netloc: + if netloc: + return _coerce_result(urlunparse((scheme, netloc, path, + params, query, fragment))) + netloc = bnetloc + + if not path and not params: + path = bpath + params = bparams + if not query: + query = bquery + return _coerce_result(urlunparse((scheme, netloc, path, + params, query, fragment))) + + base_parts = bpath.split('/') + if base_parts[-1] != '': + # the last item is not a directory, so will not be taken into account + # in resolving the relative path + del base_parts[-1] + + # for rfc3986, ignore all base path should the first character be root. + if path[:1] == '/': + segments = path.split('/') + else: + segments = base_parts + path.split('/') + # filter out elements that would cause redundant slashes on re-joining + # the resolved_path + segments[1:-1] = filter(None, segments[1:-1]) + + resolved_path = [] + + for seg in segments: + if seg == '..': + try: + resolved_path.pop() + except IndexError: + # ignore any .. segments that would otherwise cause an IndexError + # when popped from resolved_path if resolving for rfc3986 + pass + elif seg == '.': + continue + else: + resolved_path.append(seg) + + if segments[-1] in ('.', '..'): + # do some post-processing here. if the last segment was a relative dir, + # then we need to append the trailing '/' + resolved_path.append('') + + return _coerce_result(urlunparse((scheme, netloc, '/'.join( + resolved_path) or '/', params, query, fragment))) + + +def urldefrag(url): + """Removes any existing fragment from URL. + + Returns a tuple of the defragmented URL and the fragment. If + the URL contained no fragments, the second element is the + empty string. + """ + url, _coerce_result = _coerce_args(url) + if '#' in url: + s, n, p, a, q, frag = urlparse(url) + defrag = urlunparse((s, n, p, a, q, '')) + else: + frag = '' + defrag = url + return _coerce_result(DefragResult(defrag, frag)) + +_hexdig = '0123456789ABCDEFabcdef' +_hextobyte = None + +def unquote_to_bytes(string): + """unquote_to_bytes('abc%20def') -> b'abc def'.""" + # Note: strings are encoded as UTF-8. This is only an issue if it contains + # unescaped non-ASCII characters, which URIs should not. + if not string: + # Is it a string-like object? + string.split + return b'' + if isinstance(string, str): + string = string.encode('utf-8') + bits = string.split(b'%') + if len(bits) == 1: + return string + res = [bits[0]] + append = res.append + # Delay the initialization of the table to not waste memory + # if the function is never called + global _hextobyte + if _hextobyte is None: + _hextobyte = {(a + b).encode(): bytes([int(a + b, 16)]) + for a in _hexdig for b in _hexdig} + for item in bits[1:]: + try: + append(_hextobyte[item[:2]]) + append(item[2:]) + except KeyError: + append(b'%') + append(item) + return b''.join(res) + +_asciire = re.compile('([\x00-\x7f]+)') + +def unquote(string, encoding='utf-8', errors='replace'): + """Replace %xx escapes by their single-character equivalent. The optional + encoding and errors parameters specify how to decode percent-encoded + sequences into Unicode characters, as accepted by the bytes.decode() + method. + By default, percent-encoded sequences are decoded with UTF-8, and invalid + sequences are replaced by a placeholder character. + + unquote('abc%20def') -> 'abc def'. + """ + if '%' not in string: + string.split + return string + if encoding is None: + encoding = 'utf-8' + if errors is None: + errors = 'replace' + bits = _asciire.split(string) + res = [bits[0]] + append = res.append + for i in range(1, len(bits), 2): + append(unquote_to_bytes(bits[i]).decode(encoding, errors)) + append(bits[i + 1]) + return ''.join(res) + +def parse_qs(qs, keep_blank_values=False, strict_parsing=False, + encoding='utf-8', errors='replace'): + """Parse a query given as a string argument. + + Arguments: + + qs: percent-encoded query string to be parsed + + keep_blank_values: flag indicating whether blank values in + percent-encoded queries should be treated as blank strings. + A true value indicates that blanks should be retained as + blank strings. The default false value indicates that + blank values are to be ignored and treated as if they were + not included. + + strict_parsing: flag indicating what to do with parsing errors. + If false (the default), errors are silently ignored. + If true, errors raise a ValueError exception. + + encoding and errors: specify how to decode percent-encoded sequences + into Unicode characters, as accepted by the bytes.decode() method. + """ + parsed_result = {} + pairs = parse_qsl(qs, keep_blank_values, strict_parsing, + encoding=encoding, errors=errors) + for name, value in pairs: + if name in parsed_result: + parsed_result[name].append(value) + else: + parsed_result[name] = [value] + return parsed_result + +def parse_qsl(qs, keep_blank_values=False, strict_parsing=False, + encoding='utf-8', errors='replace'): + """Parse a query given as a string argument. + + Arguments: + + qs: percent-encoded query string to be parsed + + keep_blank_values: flag indicating whether blank values in + percent-encoded queries should be treated as blank strings. A + true value indicates that blanks should be retained as blank + strings. The default false value indicates that blank values + are to be ignored and treated as if they were not included. + + strict_parsing: flag indicating what to do with parsing errors. If + false (the default), errors are silently ignored. If true, + errors raise a ValueError exception. + + encoding and errors: specify how to decode percent-encoded sequences + into Unicode characters, as accepted by the bytes.decode() method. + + Returns a list, as G-d intended. + """ + qs, _coerce_result = _coerce_args(qs) + pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] + r = [] + for name_value in pairs: + if not name_value and not strict_parsing: + continue + nv = name_value.split('=', 1) + if len(nv) != 2: + if strict_parsing: + raise ValueError("bad query field: %r" % (name_value,)) + # Handle case of a control-name with no equal sign + if keep_blank_values: + nv.append('') + else: + continue + if len(nv[1]) or keep_blank_values: + name = nv[0].replace('+', ' ') + name = unquote(name, encoding=encoding, errors=errors) + name = _coerce_result(name) + value = nv[1].replace('+', ' ') + value = unquote(value, encoding=encoding, errors=errors) + value = _coerce_result(value) + r.append((name, value)) + return r + +def unquote_plus(string, encoding='utf-8', errors='replace'): + """Like unquote(), but also replace plus signs by spaces, as required for + unquoting HTML form values. + + unquote_plus('%7e/abc+def') -> '~/abc def' + """ + string = string.replace('+', ' ') + return unquote(string, encoding, errors) + +_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + b'abcdefghijklmnopqrstuvwxyz' + b'0123456789' + b'_.-') +_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE) +_safe_quoters = {} + +class Quoter(collections.defaultdict): + """A mapping from bytes (in range(0,256)) to strings. + + String values are percent-encoded byte values, unless the key < 128, and + in the "safe" set (either the specified safe set, or default set). + """ + # Keeps a cache internally, using defaultdict, for efficiency (lookups + # of cached keys don't call Python code at all). + def __init__(self, safe): + """safe: bytes object.""" + self.safe = _ALWAYS_SAFE.union(safe) + + def __repr__(self): + # Without this, will just display as a defaultdict + return "<%s %r>" % (self.__class__.__name__, dict(self)) + + def __missing__(self, b): + # Handle a cache miss. Store quoted string in cache and return. + res = chr(b) if b in self.safe else '%{:02X}'.format(b) + self[b] = res + return res + +def quote(string, safe='/', encoding=None, errors=None): + """quote('abc def') -> 'abc%20def' + + Each part of a URL, e.g. the path info, the query, etc., has a + different set of reserved characters that must be quoted. + + RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists + the following reserved characters. + + reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | + "$" | "," + + Each of these characters is reserved in some component of a URL, + but not necessarily in all of them. + + By default, the quote function is intended for quoting the path + section of a URL. Thus, it will not encode '/'. This character + is reserved, but in typical usage the quote function is being + called on a path where the existing slash characters are used as + reserved characters. + + string and safe may be either str or bytes objects. encoding and errors + must not be specified if string is a bytes object. + + The optional encoding and errors parameters specify how to deal with + non-ASCII characters, as accepted by the str.encode method. + By default, encoding='utf-8' (characters are encoded with UTF-8), and + errors='strict' (unsupported characters raise a UnicodeEncodeError). + """ + if isinstance(string, str): + if not string: + return string + if encoding is None: + encoding = 'utf-8' + if errors is None: + errors = 'strict' + string = string.encode(encoding, errors) + else: + if encoding is not None: + raise TypeError("quote() doesn't support 'encoding' for bytes") + if errors is not None: + raise TypeError("quote() doesn't support 'errors' for bytes") + return quote_from_bytes(string, safe) + +def quote_plus(string, safe='', encoding=None, errors=None): + """Like quote(), but also replace ' ' with '+', as required for quoting + HTML form values. Plus signs in the original string are escaped unless + they are included in safe. It also does not have safe default to '/'. + """ + # Check if ' ' in string, where string may either be a str or bytes. If + # there are no spaces, the regular quote will produce the right answer. + if ((isinstance(string, str) and ' ' not in string) or + (isinstance(string, bytes) and b' ' not in string)): + return quote(string, safe, encoding, errors) + if isinstance(safe, str): + space = ' ' + else: + space = b' ' + string = quote(string, safe + space, encoding, errors) + return string.replace(' ', '+') + +def quote_from_bytes(bs, safe='/'): + """Like quote(), but accepts a bytes object rather than a str, and does + not perform string-to-bytes encoding. It always returns an ASCII string. + quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3f' + """ + if not isinstance(bs, (bytes, bytearray)): + raise TypeError("quote_from_bytes() expected bytes") + if not bs: + return '' + if isinstance(safe, str): + # Normalize 'safe' by converting to bytes and removing non-ASCII chars + safe = safe.encode('ascii', 'ignore') + else: + safe = bytes([c for c in safe if c < 128]) + if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe): + return bs.decode() + try: + quoter = _safe_quoters[safe] + except KeyError: + _safe_quoters[safe] = quoter = Quoter(safe).__getitem__ + return ''.join([quoter(char) for char in bs]) + +def urlencode(query, doseq=False, safe='', encoding=None, errors=None, + quote_via=quote_plus): + """Encode a dict or sequence of two-element tuples into a URL query string. + + If any values in the query arg are sequences and doseq is true, each + sequence element is converted to a separate parameter. + + If the query arg is a sequence of two-element tuples, the order of the + parameters in the output will match the order of parameters in the + input. + + The components of a query arg may each be either a string or a bytes type. + + The safe, encoding, and errors parameters are passed down to the function + specified by quote_via (encoding and errors only if a component is a str). + """ + + if hasattr(query, "items"): + query = query.items() + else: + # It's a bother at times that strings and string-like objects are + # sequences. + try: + # non-sequence items should not work with len() + # non-empty strings will fail this + if len(query) and not isinstance(query[0], tuple): + raise TypeError + # Zero-length sequences of all types will get here and succeed, + # but that's a minor nit. Since the original implementation + # allowed empty dicts that type of behavior probably should be + # preserved for consistency + except TypeError: + ty, va, tb = sys.exc_info() + raise TypeError("not a valid non-string sequence " + "or mapping object").with_traceback(tb) + + l = [] + if not doseq: + for k, v in query: + if isinstance(k, bytes): + k = quote_via(k, safe) + else: + k = quote_via(str(k), safe, encoding, errors) + + if isinstance(v, bytes): + v = quote_via(v, safe) + else: + v = quote_via(str(v), safe, encoding, errors) + l.append(k + '=' + v) + else: + for k, v in query: + if isinstance(k, bytes): + k = quote_via(k, safe) + else: + k = quote_via(str(k), safe, encoding, errors) + + if isinstance(v, bytes): + v = quote_via(v, safe) + l.append(k + '=' + v) + elif isinstance(v, str): + v = quote_via(v, safe, encoding, errors) + l.append(k + '=' + v) + else: + try: + # Is this a sufficient test for sequence-ness? + x = len(v) + except TypeError: + # not a sequence + v = quote_via(str(v), safe, encoding, errors) + l.append(k + '=' + v) + else: + # loop over the sequence + for elt in v: + if isinstance(elt, bytes): + elt = quote_via(elt, safe) + else: + elt = quote_via(str(elt), safe, encoding, errors) + l.append(k + '=' + elt) + return '&'.join(l) + +def to_bytes(url): + """to_bytes(u"URL") --> 'URL'.""" + # Most URL schemes require ASCII. If that changes, the conversion + # can be relaxed. + # XXX get rid of to_bytes() + if isinstance(url, str): + try: + url = url.encode("ASCII").decode() + except UnicodeError: + raise UnicodeError("URL " + repr(url) + + " contains non-ASCII characters") + return url + +def unwrap(url): + """unwrap('') --> 'type://host/path'.""" + url = str(url).strip() + if url[:1] == '<' and url[-1:] == '>': + url = url[1:-1].strip() + if url[:4] == 'URL:': url = url[4:].strip() + return url + +_typeprog = None +def splittype(url): + """splittype('type:opaquestring') --> 'type', 'opaquestring'.""" + global _typeprog + if _typeprog is None: + _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL) + + match = _typeprog.match(url) + if match: + scheme, data = match.groups() + return scheme.lower(), data + return None, url + +_hostprog = None +def splithost(url): + """splithost('//host[:port]/path') --> 'host[:port]', '/path'.""" + global _hostprog + if _hostprog is None: + _hostprog = re.compile('//([^/?]*)(.*)', re.DOTALL) + + match = _hostprog.match(url) + if match: + host_port, path = match.groups() + if path and path[0] != '/': + path = '/' + path + return host_port, path + return None, url + +def splituser(host): + """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.""" + user, delim, host = host.rpartition('@') + return (user if delim else None), host + +def splitpasswd(user): + """splitpasswd('user:passwd') -> 'user', 'passwd'.""" + user, delim, passwd = user.partition(':') + return user, (passwd if delim else None) + +# splittag('/path#tag') --> '/path', 'tag' +_portprog = None +def splitport(host): + """splitport('host:port') --> 'host', 'port'.""" + global _portprog + if _portprog is None: + _portprog = re.compile('(.*):([0-9]*)$', re.DOTALL) + + match = _portprog.match(host) + if match: + host, port = match.groups() + if port: + return host, port + return host, None + +def splitnport(host, defport=-1): + """Split host and port, returning numeric port. + Return given default port if no ':' found; defaults to -1. + Return numerical port if a valid number are found after ':'. + Return None if ':' but not a valid number.""" + host, delim, port = host.rpartition(':') + if not delim: + host = port + elif port: + try: + nport = int(port) + except ValueError: + nport = None + return host, nport + return host, defport + +def splitquery(url): + """splitquery('/path?query') --> '/path', 'query'.""" + path, delim, query = url.rpartition('?') + if delim: + return path, query + return url, None + +def splittag(url): + """splittag('/path#tag') --> '/path', 'tag'.""" + path, delim, tag = url.rpartition('#') + if delim: + return path, tag + return url, None + +def splitattr(url): + """splitattr('/path;attr1=value1;attr2=value2;...') -> + '/path', ['attr1=value1', 'attr2=value2', ...].""" + words = url.split(';') + return words[0], words[1:] + +def splitvalue(attr): + """splitvalue('attr=value') --> 'attr', 'value'.""" + attr, delim, value = attr.partition('=') + return attr, (value if delim else None) diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py new file mode 100644 index 0000000000..5f15b74f4d --- /dev/null +++ b/Lib/urllib/request.py @@ -0,0 +1,2743 @@ +"""An extensible library for opening URLs using a variety of protocols + +The simplest way to use this module is to call the urlopen function, +which accepts a string containing a URL or a Request object (described +below). It opens the URL and returns the results as file-like +object; the returned object has some extra methods described below. + +The OpenerDirector manages a collection of Handler objects that do +all the actual work. Each Handler implements a particular protocol or +option. The OpenerDirector is a composite object that invokes the +Handlers needed to open the requested URL. For example, the +HTTPHandler performs HTTP GET and POST requests and deals with +non-error returns. The HTTPRedirectHandler automatically deals with +HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler +deals with digest authentication. + +urlopen(url, data=None) -- Basic usage is the same as original +urllib. pass the url and optionally data to post to an HTTP URL, and +get a file-like object back. One difference is that you can also pass +a Request instance instead of URL. Raises a URLError (subclass of +OSError); for HTTP errors, raises an HTTPError, which can also be +treated as a valid response. + +build_opener -- Function that creates a new OpenerDirector instance. +Will install the default handlers. Accepts one or more Handlers as +arguments, either instances or Handler classes that it will +instantiate. If one of the argument is a subclass of the default +handler, the argument will be installed instead of the default. + +install_opener -- Installs a new opener as the default opener. + +objects of interest: + +OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages +the Handler classes, while dealing with requests and responses. + +Request -- An object that encapsulates the state of a request. The +state can be as simple as the URL. It can also include extra HTTP +headers, e.g. a User-Agent. + +BaseHandler -- + +internals: +BaseHandler and parent +_call_chain conventions + +Example usage: + +import urllib.request + +# set up authentication info +authinfo = urllib.request.HTTPBasicAuthHandler() +authinfo.add_password(realm='PDQ Application', + uri='https://mahler:8092/site-updates.py', + user='klem', + passwd='geheim$parole') + +proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"}) + +# build a new opener that adds authentication and caching FTP handlers +opener = urllib.request.build_opener(proxy_support, authinfo, + urllib.request.CacheFTPHandler) + +# install it +urllib.request.install_opener(opener) + +f = urllib.request.urlopen('http://www.python.org/') +""" + +# XXX issues: +# If an authentication error handler that tries to perform +# authentication for some reason but fails, how should the error be +# signalled? The client needs to know the HTTP error code. But if +# the handler knows that the problem was, e.g., that it didn't know +# that hash algo that requested in the challenge, it would be good to +# pass that information along to the client, too. +# ftp errors aren't handled cleanly +# check digest against correct (i.e. non-apache) implementation + +# Possible extensions: +# complex proxies XXX not sure what exactly was meant by this +# abstract factory for opener + +import base64 +import bisect +import email +import hashlib +import http.client +import io +import os +import posixpath +import re +import socket +import string +import sys +import time +import collections +import tempfile +import contextlib +import warnings + + +from urllib.error import URLError, HTTPError, ContentTooShortError +from urllib.parse import ( + urlparse, urlsplit, urljoin, unwrap, quote, unquote, + splittype, splithost, splitport, splituser, splitpasswd, + splitattr, splitquery, splitvalue, splittag, to_bytes, + unquote_to_bytes, urlunparse) +from urllib.response import addinfourl, addclosehook + +# check for SSL +try: + import ssl +except ImportError: + _have_ssl = False +else: + _have_ssl = True + +__all__ = [ + # Classes + 'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler', + 'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler', + 'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm', + 'HTTPPasswordMgrWithPriorAuth', 'AbstractBasicAuthHandler', + 'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler', 'AbstractDigestAuthHandler', + 'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler', 'HTTPHandler', + 'FileHandler', 'FTPHandler', 'CacheFTPHandler', 'DataHandler', + 'UnknownHandler', 'HTTPErrorProcessor', + # Functions + 'urlopen', 'install_opener', 'build_opener', + 'pathname2url', 'url2pathname', 'getproxies', + # Legacy interface + 'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener', +] + +# used in User-Agent header sent +__version__ = '%d.%d' % sys.version_info[:2] + +_opener = None +def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, + *, cafile=None, capath=None, cadefault=False, context=None): + '''Open the URL url, which can be either a string or a Request object. + + *data* must be an object specifying additional data to be sent to + the server, or None if no such data is needed. See Request for + details. + + urllib.request module uses HTTP/1.1 and includes a "Connection:close" + header in its HTTP requests. + + The optional *timeout* parameter specifies a timeout in seconds for + blocking operations like the connection attempt (if not specified, the + global default timeout setting will be used). This only works for HTTP, + HTTPS and FTP connections. + + If *context* is specified, it must be a ssl.SSLContext instance describing + the various SSL options. See HTTPSConnection for more details. + + The optional *cafile* and *capath* parameters specify a set of trusted CA + certificates for HTTPS requests. cafile should point to a single file + containing a bundle of CA certificates, whereas capath should point to a + directory of hashed certificate files. More information can be found in + ssl.SSLContext.load_verify_locations(). + + The *cadefault* parameter is ignored. + + This function always returns an object which can work as a context + manager and has methods such as + + * geturl() - return the URL of the resource retrieved, commonly used to + determine if a redirect was followed + + * info() - return the meta-information of the page, such as headers, in the + form of an email.message_from_string() instance (see Quick Reference to + HTTP Headers) + + * getcode() - return the HTTP status code of the response. Raises URLError + on errors. + + For HTTP and HTTPS URLs, this function returns a http.client.HTTPResponse + object slightly modified. In addition to the three new methods above, the + msg attribute contains the same information as the reason attribute --- + the reason phrase returned by the server --- instead of the response + headers as it is specified in the documentation for HTTPResponse. + + For FTP, file, and data URLs and requests explicitly handled by legacy + URLopener and FancyURLopener classes, this function returns a + urllib.response.addinfourl object. + + Note that None may be returned if no handler handles the request (though + the default installed global OpenerDirector uses UnknownHandler to ensure + this never happens). + + In addition, if proxy settings are detected (for example, when a *_proxy + environment variable like http_proxy is set), ProxyHandler is default + installed and makes sure the requests are handled through the proxy. + + ''' + global _opener + if cafile or capath or cadefault: + import warnings + warnings.warn("cafile, cpath and cadefault are deprecated, use a " + "custom context instead.", DeprecationWarning, 2) + if context is not None: + raise ValueError( + "You can't pass both context and any of cafile, capath, and " + "cadefault" + ) + if not _have_ssl: + raise ValueError('SSL support not available') + context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH, + cafile=cafile, + capath=capath) + https_handler = HTTPSHandler(context=context) + opener = build_opener(https_handler) + elif context: + https_handler = HTTPSHandler(context=context) + opener = build_opener(https_handler) + elif _opener is None: + _opener = opener = build_opener() + else: + opener = _opener + return opener.open(url, data, timeout) + +def install_opener(opener): + global _opener + _opener = opener + +_url_tempfiles = [] +def urlretrieve(url, filename=None, reporthook=None, data=None): + """ + Retrieve a URL into a temporary location on disk. + + Requires a URL argument. If a filename is passed, it is used as + the temporary file location. The reporthook argument should be + a callable that accepts a block number, a read size, and the + total file size of the URL target. The data argument should be + valid URL encoded data. + + If a filename is passed and the URL points to a local resource, + the result is a copy from local file to new file. + + Returns a tuple containing the path to the newly created + data file as well as the resulting HTTPMessage object. + """ + url_type, path = splittype(url) + + with contextlib.closing(urlopen(url, data)) as fp: + headers = fp.info() + + # Just return the local path and the "headers" for file:// + # URLs. No sense in performing a copy unless requested. + if url_type == "file" and not filename: + return os.path.normpath(path), headers + + # Handle temporary file setup. + if filename: + tfp = open(filename, 'wb') + else: + tfp = tempfile.NamedTemporaryFile(delete=False) + filename = tfp.name + _url_tempfiles.append(filename) + + with tfp: + result = filename, headers + bs = 1024*8 + size = -1 + read = 0 + blocknum = 0 + if "content-length" in headers: + size = int(headers["Content-Length"]) + + if reporthook: + reporthook(blocknum, bs, size) + + while True: + block = fp.read(bs) + if not block: + break + read += len(block) + tfp.write(block) + blocknum += 1 + if reporthook: + reporthook(blocknum, bs, size) + + if size >= 0 and read < size: + raise ContentTooShortError( + "retrieval incomplete: got only %i out of %i bytes" + % (read, size), result) + + return result + +def urlcleanup(): + """Clean up temporary files from urlretrieve calls.""" + for temp_file in _url_tempfiles: + try: + os.unlink(temp_file) + except OSError: + pass + + del _url_tempfiles[:] + global _opener + if _opener: + _opener = None + +# copied from cookielib.py +_cut_port_re = re.compile(r":\d+$", re.ASCII) +def request_host(request): + """Return request-host, as defined by RFC 2965. + + Variation from RFC: returned value is lowercased, for convenient + comparison. + + """ + url = request.full_url + host = urlparse(url)[1] + if host == "": + host = request.get_header("Host", "") + + # remove port, if present + host = _cut_port_re.sub("", host, 1) + return host.lower() + +class Request: + + def __init__(self, url, data=None, headers={}, + origin_req_host=None, unverifiable=False, + method=None): + self.full_url = url + self.headers = {} + self.unredirected_hdrs = {} + self._data = None + self.data = data + self._tunnel_host = None + for key, value in headers.items(): + self.add_header(key, value) + if origin_req_host is None: + origin_req_host = request_host(self) + self.origin_req_host = origin_req_host + self.unverifiable = unverifiable + if method: + self.method = method + + @property + def full_url(self): + if self.fragment: + return '{}#{}'.format(self._full_url, self.fragment) + return self._full_url + + @full_url.setter + def full_url(self, url): + # unwrap('') --> 'type://host/path' + self._full_url = unwrap(url) + self._full_url, self.fragment = splittag(self._full_url) + self._parse() + + @full_url.deleter + def full_url(self): + self._full_url = None + self.fragment = None + self.selector = '' + + @property + def data(self): + return self._data + + @data.setter + def data(self, data): + if data != self._data: + self._data = data + # issue 16464 + # if we change data we need to remove content-length header + # (cause it's most probably calculated for previous value) + if self.has_header("Content-length"): + self.remove_header("Content-length") + + @data.deleter + def data(self): + self.data = None + + def _parse(self): + self.type, rest = splittype(self._full_url) + if self.type is None: + raise ValueError("unknown url type: %r" % self.full_url) + self.host, self.selector = splithost(rest) + if self.host: + self.host = unquote(self.host) + + def get_method(self): + """Return a string indicating the HTTP request method.""" + default_method = "POST" if self.data is not None else "GET" + return getattr(self, 'method', default_method) + + def get_full_url(self): + return self.full_url + + def set_proxy(self, host, type): + if self.type == 'https' and not self._tunnel_host: + self._tunnel_host = self.host + else: + self.type= type + self.selector = self.full_url + self.host = host + + def has_proxy(self): + return self.selector == self.full_url + + def add_header(self, key, val): + # useful for something like authentication + self.headers[key.capitalize()] = val + + def add_unredirected_header(self, key, val): + # will not be added to a redirected request + self.unredirected_hdrs[key.capitalize()] = val + + def has_header(self, header_name): + return (header_name in self.headers or + header_name in self.unredirected_hdrs) + + def get_header(self, header_name, default=None): + return self.headers.get( + header_name, + self.unredirected_hdrs.get(header_name, default)) + + def remove_header(self, header_name): + self.headers.pop(header_name, None) + self.unredirected_hdrs.pop(header_name, None) + + def header_items(self): + hdrs = self.unredirected_hdrs.copy() + hdrs.update(self.headers) + return list(hdrs.items()) + +class OpenerDirector: + def __init__(self): + client_version = "Python-urllib/%s" % __version__ + self.addheaders = [('User-agent', client_version)] + # self.handlers is retained only for backward compatibility + self.handlers = [] + # manage the individual handlers + self.handle_open = {} + self.handle_error = {} + self.process_response = {} + self.process_request = {} + + def add_handler(self, handler): + if not hasattr(handler, "add_parent"): + raise TypeError("expected BaseHandler instance, got %r" % + type(handler)) + + added = False + for meth in dir(handler): + if meth in ["redirect_request", "do_open", "proxy_open"]: + # oops, coincidental match + continue + + i = meth.find("_") + protocol = meth[:i] + condition = meth[i+1:] + + if condition.startswith("error"): + j = condition.find("_") + i + 1 + kind = meth[j+1:] + try: + kind = int(kind) + except ValueError: + pass + lookup = self.handle_error.get(protocol, {}) + self.handle_error[protocol] = lookup + elif condition == "open": + kind = protocol + lookup = self.handle_open + elif condition == "response": + kind = protocol + lookup = self.process_response + elif condition == "request": + kind = protocol + lookup = self.process_request + else: + continue + + handlers = lookup.setdefault(kind, []) + if handlers: + bisect.insort(handlers, handler) + else: + handlers.append(handler) + added = True + + if added: + bisect.insort(self.handlers, handler) + handler.add_parent(self) + + def close(self): + # Only exists for backwards compatibility. + pass + + def _call_chain(self, chain, kind, meth_name, *args): + # Handlers raise an exception if no one else should try to handle + # the request, or return None if they can't but another handler + # could. Otherwise, they return the response. + handlers = chain.get(kind, ()) + for handler in handlers: + func = getattr(handler, meth_name) + result = func(*args) + if result is not None: + return result + + def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT): + # accept a URL or a Request object + if isinstance(fullurl, str): + req = Request(fullurl, data) + else: + req = fullurl + if data is not None: + req.data = data + + req.timeout = timeout + protocol = req.type + + # pre-process request + meth_name = protocol+"_request" + for processor in self.process_request.get(protocol, []): + meth = getattr(processor, meth_name) + req = meth(req) + + response = self._open(req, data) + + # post-process response + meth_name = protocol+"_response" + for processor in self.process_response.get(protocol, []): + meth = getattr(processor, meth_name) + response = meth(req, response) + + return response + + def _open(self, req, data=None): + result = self._call_chain(self.handle_open, 'default', + 'default_open', req) + if result: + return result + + protocol = req.type + result = self._call_chain(self.handle_open, protocol, protocol + + '_open', req) + if result: + return result + + return self._call_chain(self.handle_open, 'unknown', + 'unknown_open', req) + + def error(self, proto, *args): + if proto in ('http', 'https'): + # XXX http[s] protocols are special-cased + dict = self.handle_error['http'] # https is not different than http + proto = args[2] # YUCK! + meth_name = 'http_error_%s' % proto + http_err = 1 + orig_args = args + else: + dict = self.handle_error + meth_name = proto + '_error' + http_err = 0 + args = (dict, proto, meth_name) + args + result = self._call_chain(*args) + if result: + return result + + if http_err: + args = (dict, 'default', 'http_error_default') + orig_args + return self._call_chain(*args) + +# XXX probably also want an abstract factory that knows when it makes +# sense to skip a superclass in favor of a subclass and when it might +# make sense to include both + +def build_opener(*handlers): + """Create an opener object from a list of handlers. + + The opener will use several default handlers, including support + for HTTP, FTP and when applicable HTTPS. + + If any of the handlers passed as arguments are subclasses of the + default handlers, the default handlers will not be used. + """ + opener = OpenerDirector() + default_classes = [ProxyHandler, UnknownHandler, HTTPHandler, + HTTPDefaultErrorHandler, HTTPRedirectHandler, + FTPHandler, FileHandler, HTTPErrorProcessor, + DataHandler] + if hasattr(http.client, "HTTPSConnection"): + default_classes.append(HTTPSHandler) + skip = set() + for klass in default_classes: + for check in handlers: + if isinstance(check, type): + if issubclass(check, klass): + skip.add(klass) + elif isinstance(check, klass): + skip.add(klass) + for klass in skip: + default_classes.remove(klass) + + for klass in default_classes: + opener.add_handler(klass()) + + for h in handlers: + if isinstance(h, type): + h = h() + opener.add_handler(h) + return opener + +class BaseHandler: + handler_order = 500 + + def add_parent(self, parent): + self.parent = parent + + def close(self): + # Only exists for backwards compatibility + pass + + def __lt__(self, other): + if not hasattr(other, "handler_order"): + # Try to preserve the old behavior of having custom classes + # inserted after default ones (works only for custom user + # classes which are not aware of handler_order). + return True + return self.handler_order < other.handler_order + + +class HTTPErrorProcessor(BaseHandler): + """Process HTTP error responses.""" + handler_order = 1000 # after all other processing + + def http_response(self, request, response): + code, msg, hdrs = response.code, response.msg, response.info() + + # According to RFC 2616, "2xx" code indicates that the client's + # request was successfully received, understood, and accepted. + if not (200 <= code < 300): + response = self.parent.error( + 'http', request, response, code, msg, hdrs) + + return response + + https_response = http_response + +class HTTPDefaultErrorHandler(BaseHandler): + def http_error_default(self, req, fp, code, msg, hdrs): + raise HTTPError(req.full_url, code, msg, hdrs, fp) + +class HTTPRedirectHandler(BaseHandler): + # maximum number of redirections to any single URL + # this is needed because of the state that cookies introduce + max_repeats = 4 + # maximum total number of redirections (regardless of URL) before + # assuming we're in a loop + max_redirections = 10 + + def redirect_request(self, req, fp, code, msg, headers, newurl): + """Return a Request or None in response to a redirect. + + This is called by the http_error_30x methods when a + redirection response is received. If a redirection should + take place, return a new Request to allow http_error_30x to + perform the redirect. Otherwise, raise HTTPError if no-one + else should try to handle this url. Return None if you can't + but another Handler might. + """ + m = req.get_method() + if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD") + or code in (301, 302, 303) and m == "POST")): + raise HTTPError(req.full_url, code, msg, headers, fp) + + # Strictly (according to RFC 2616), 301 or 302 in response to + # a POST MUST NOT cause a redirection without confirmation + # from the user (of urllib.request, in this case). In practice, + # essentially all clients do redirect in this case, so we do + # the same. + + # Be conciliant with URIs containing a space. This is mainly + # redundant with the more complete encoding done in http_error_302(), + # but it is kept for compatibility with other callers. + newurl = newurl.replace(' ', '%20') + + CONTENT_HEADERS = ("content-length", "content-type") + newheaders = dict((k, v) for k, v in req.headers.items() + if k.lower() not in CONTENT_HEADERS) + return Request(newurl, + headers=newheaders, + origin_req_host=req.origin_req_host, + unverifiable=True) + + # Implementation note: To avoid the server sending us into an + # infinite loop, the request object needs to track what URLs we + # have already seen. Do this by adding a handler-specific + # attribute to the Request object. + def http_error_302(self, req, fp, code, msg, headers): + # Some servers (incorrectly) return multiple Location headers + # (so probably same goes for URI). Use first header. + if "location" in headers: + newurl = headers["location"] + elif "uri" in headers: + newurl = headers["uri"] + else: + return + + # fix a possible malformed URL + urlparts = urlparse(newurl) + + # For security reasons we don't allow redirection to anything other + # than http, https or ftp. + + if urlparts.scheme not in ('http', 'https', 'ftp', ''): + raise HTTPError( + newurl, code, + "%s - Redirection to url '%s' is not allowed" % (msg, newurl), + headers, fp) + + if not urlparts.path and urlparts.netloc: + urlparts = list(urlparts) + urlparts[2] = "/" + newurl = urlunparse(urlparts) + + # http.client.parse_headers() decodes as ISO-8859-1. Recover the + # original bytes and percent-encode non-ASCII bytes, and any special + # characters such as the space. + newurl = quote( + newurl, encoding="iso-8859-1", safe=string.punctuation) + newurl = urljoin(req.full_url, newurl) + + # XXX Probably want to forget about the state of the current + # request, although that might interact poorly with other + # handlers that also use handler-specific request attributes + new = self.redirect_request(req, fp, code, msg, headers, newurl) + if new is None: + return + + # loop detection + # .redirect_dict has a key url if url was previously visited. + if hasattr(req, 'redirect_dict'): + visited = new.redirect_dict = req.redirect_dict + if (visited.get(newurl, 0) >= self.max_repeats or + len(visited) >= self.max_redirections): + raise HTTPError(req.full_url, code, + self.inf_msg + msg, headers, fp) + else: + visited = new.redirect_dict = req.redirect_dict = {} + visited[newurl] = visited.get(newurl, 0) + 1 + + # Don't close the fp until we are sure that we won't use it + # with HTTPError. + fp.read() + fp.close() + + return self.parent.open(new, timeout=req.timeout) + + http_error_301 = http_error_303 = http_error_307 = http_error_302 + + inf_msg = "The HTTP server returned a redirect error that would " \ + "lead to an infinite loop.\n" \ + "The last 30x error message was:\n" + + +def _parse_proxy(proxy): + """Return (scheme, user, password, host/port) given a URL or an authority. + + If a URL is supplied, it must have an authority (host:port) component. + According to RFC 3986, having an authority component means the URL must + have two slashes after the scheme. + """ + scheme, r_scheme = splittype(proxy) + if not r_scheme.startswith("/"): + # authority + scheme = None + authority = proxy + else: + # URL + if not r_scheme.startswith("//"): + raise ValueError("proxy URL with no authority: %r" % proxy) + # We have an authority, so for RFC 3986-compliant URLs (by ss 3. + # and 3.3.), path is empty or starts with '/' + end = r_scheme.find("/", 2) + if end == -1: + end = None + authority = r_scheme[2:end] + userinfo, hostport = splituser(authority) + if userinfo is not None: + user, password = splitpasswd(userinfo) + else: + user = password = None + return scheme, user, password, hostport + +class ProxyHandler(BaseHandler): + # Proxies must be in front + handler_order = 100 + + def __init__(self, proxies=None): + if proxies is None: + proxies = getproxies() + assert hasattr(proxies, 'keys'), "proxies must be a mapping" + self.proxies = proxies + for type, url in proxies.items(): + setattr(self, '%s_open' % type, + lambda r, proxy=url, type=type, meth=self.proxy_open: + meth(r, proxy, type)) + + def proxy_open(self, req, proxy, type): + orig_type = req.type + proxy_type, user, password, hostport = _parse_proxy(proxy) + if proxy_type is None: + proxy_type = orig_type + + if req.host and proxy_bypass(req.host): + return None + + if user and password: + user_pass = '%s:%s' % (unquote(user), + unquote(password)) + creds = base64.b64encode(user_pass.encode()).decode("ascii") + req.add_header('Proxy-authorization', 'Basic ' + creds) + hostport = unquote(hostport) + req.set_proxy(hostport, proxy_type) + if orig_type == proxy_type or orig_type == 'https': + # let other handlers take care of it + return None + else: + # need to start over, because the other handlers don't + # grok the proxy's URL type + # e.g. if we have a constructor arg proxies like so: + # {'http': 'ftp://proxy.example.com'}, we may end up turning + # a request for http://acme.example.com/a into one for + # ftp://proxy.example.com/a + return self.parent.open(req, timeout=req.timeout) + +class HTTPPasswordMgr: + + def __init__(self): + self.passwd = {} + + def add_password(self, realm, uri, user, passwd): + # uri could be a single URI or a sequence + if isinstance(uri, str): + uri = [uri] + if realm not in self.passwd: + self.passwd[realm] = {} + for default_port in True, False: + reduced_uri = tuple( + [self.reduce_uri(u, default_port) for u in uri]) + self.passwd[realm][reduced_uri] = (user, passwd) + + def find_user_password(self, realm, authuri): + domains = self.passwd.get(realm, {}) + for default_port in True, False: + reduced_authuri = self.reduce_uri(authuri, default_port) + for uris, authinfo in domains.items(): + for uri in uris: + if self.is_suburi(uri, reduced_authuri): + return authinfo + return None, None + + def reduce_uri(self, uri, default_port=True): + """Accept authority or URI and extract only the authority and path.""" + # note HTTP URLs do not have a userinfo component + parts = urlsplit(uri) + if parts[1]: + # URI + scheme = parts[0] + authority = parts[1] + path = parts[2] or '/' + else: + # host or host:port + scheme = None + authority = uri + path = '/' + host, port = splitport(authority) + if default_port and port is None and scheme is not None: + dport = {"http": 80, + "https": 443, + }.get(scheme) + if dport is not None: + authority = "%s:%d" % (host, dport) + return authority, path + + def is_suburi(self, base, test): + """Check if test is below base in a URI tree + + Both args must be URIs in reduced form. + """ + if base == test: + return True + if base[0] != test[0]: + return False + common = posixpath.commonprefix((base[1], test[1])) + if len(common) == len(base[1]): + return True + return False + + +class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr): + + def find_user_password(self, realm, authuri): + user, password = HTTPPasswordMgr.find_user_password(self, realm, + authuri) + if user is not None: + return user, password + return HTTPPasswordMgr.find_user_password(self, None, authuri) + + +class HTTPPasswordMgrWithPriorAuth(HTTPPasswordMgrWithDefaultRealm): + + def __init__(self, *args, **kwargs): + self.authenticated = {} + super().__init__(*args, **kwargs) + + def add_password(self, realm, uri, user, passwd, is_authenticated=False): + self.update_authenticated(uri, is_authenticated) + # Add a default for prior auth requests + if realm is not None: + super().add_password(None, uri, user, passwd) + super().add_password(realm, uri, user, passwd) + + def update_authenticated(self, uri, is_authenticated=False): + # uri could be a single URI or a sequence + if isinstance(uri, str): + uri = [uri] + + for default_port in True, False: + for u in uri: + reduced_uri = self.reduce_uri(u, default_port) + self.authenticated[reduced_uri] = is_authenticated + + def is_authenticated(self, authuri): + for default_port in True, False: + reduced_authuri = self.reduce_uri(authuri, default_port) + for uri in self.authenticated: + if self.is_suburi(uri, reduced_authuri): + return self.authenticated[uri] + + +class AbstractBasicAuthHandler: + + # XXX this allows for multiple auth-schemes, but will stupidly pick + # the last one with a realm specified. + + # allow for double- and single-quoted realm values + # (single quotes are a violation of the RFC, but appear in the wild) + rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+' + 'realm=(["\']?)([^"\']*)\\2', re.I) + + # XXX could pre-emptively send auth info already accepted (RFC 2617, + # end of section 2, and section 1.2 immediately after "credentials" + # production). + + def __init__(self, password_mgr=None): + if password_mgr is None: + password_mgr = HTTPPasswordMgr() + self.passwd = password_mgr + self.add_password = self.passwd.add_password + + def http_error_auth_reqed(self, authreq, host, req, headers): + # host may be an authority (without userinfo) or a URL with an + # authority + # XXX could be multiple headers + authreq = headers.get(authreq, None) + + if authreq: + scheme = authreq.split()[0] + if scheme.lower() != 'basic': + raise ValueError("AbstractBasicAuthHandler does not" + " support the following scheme: '%s'" % + scheme) + else: + mo = AbstractBasicAuthHandler.rx.search(authreq) + if mo: + scheme, quote, realm = mo.groups() + if quote not in ['"',"'"]: + warnings.warn("Basic Auth Realm was unquoted", + UserWarning, 2) + if scheme.lower() == 'basic': + return self.retry_http_basic_auth(host, req, realm) + + def retry_http_basic_auth(self, host, req, realm): + user, pw = self.passwd.find_user_password(realm, host) + if pw is not None: + raw = "%s:%s" % (user, pw) + auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii") + if req.get_header(self.auth_header, None) == auth: + return None + req.add_unredirected_header(self.auth_header, auth) + return self.parent.open(req, timeout=req.timeout) + else: + return None + + def http_request(self, req): + if (not hasattr(self.passwd, 'is_authenticated') or + not self.passwd.is_authenticated(req.full_url)): + return req + + if not req.has_header('Authorization'): + user, passwd = self.passwd.find_user_password(None, req.full_url) + credentials = '{0}:{1}'.format(user, passwd).encode() + auth_str = base64.standard_b64encode(credentials).decode() + req.add_unredirected_header('Authorization', + 'Basic {}'.format(auth_str.strip())) + return req + + def http_response(self, req, response): + if hasattr(self.passwd, 'is_authenticated'): + if 200 <= response.code < 300: + self.passwd.update_authenticated(req.full_url, True) + else: + self.passwd.update_authenticated(req.full_url, False) + return response + + https_request = http_request + https_response = http_response + + + +class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): + + auth_header = 'Authorization' + + def http_error_401(self, req, fp, code, msg, headers): + url = req.full_url + response = self.http_error_auth_reqed('www-authenticate', + url, req, headers) + return response + + +class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): + + auth_header = 'Proxy-authorization' + + def http_error_407(self, req, fp, code, msg, headers): + # http_error_auth_reqed requires that there is no userinfo component in + # authority. Assume there isn't one, since urllib.request does not (and + # should not, RFC 3986 s. 3.2.1) support requests for URLs containing + # userinfo. + authority = req.host + response = self.http_error_auth_reqed('proxy-authenticate', + authority, req, headers) + return response + + +# Return n random bytes. +_randombytes = os.urandom + + +class AbstractDigestAuthHandler: + # Digest authentication is specified in RFC 2617. + + # XXX The client does not inspect the Authentication-Info header + # in a successful response. + + # XXX It should be possible to test this implementation against + # a mock server that just generates a static set of challenges. + + # XXX qop="auth-int" supports is shaky + + def __init__(self, passwd=None): + if passwd is None: + passwd = HTTPPasswordMgr() + self.passwd = passwd + self.add_password = self.passwd.add_password + self.retried = 0 + self.nonce_count = 0 + self.last_nonce = None + + def reset_retry_count(self): + self.retried = 0 + + def http_error_auth_reqed(self, auth_header, host, req, headers): + authreq = headers.get(auth_header, None) + if self.retried > 5: + # Don't fail endlessly - if we failed once, we'll probably + # fail a second time. Hm. Unless the Password Manager is + # prompting for the information. Crap. This isn't great + # but it's better than the current 'repeat until recursion + # depth exceeded' approach + raise HTTPError(req.full_url, 401, "digest auth failed", + headers, None) + else: + self.retried += 1 + if authreq: + scheme = authreq.split()[0] + if scheme.lower() == 'digest': + return self.retry_http_digest_auth(req, authreq) + elif scheme.lower() != 'basic': + raise ValueError("AbstractDigestAuthHandler does not support" + " the following scheme: '%s'" % scheme) + + def retry_http_digest_auth(self, req, auth): + token, challenge = auth.split(' ', 1) + chal = parse_keqv_list(filter(None, parse_http_list(challenge))) + auth = self.get_authorization(req, chal) + if auth: + auth_val = 'Digest %s' % auth + if req.headers.get(self.auth_header, None) == auth_val: + return None + req.add_unredirected_header(self.auth_header, auth_val) + resp = self.parent.open(req, timeout=req.timeout) + return resp + + def get_cnonce(self, nonce): + # The cnonce-value is an opaque + # quoted string value provided by the client and used by both client + # and server to avoid chosen plaintext attacks, to provide mutual + # authentication, and to provide some message integrity protection. + # This isn't a fabulous effort, but it's probably Good Enough. + s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime()) + b = s.encode("ascii") + _randombytes(8) + dig = hashlib.sha1(b).hexdigest() + return dig[:16] + + def get_authorization(self, req, chal): + try: + realm = chal['realm'] + nonce = chal['nonce'] + qop = chal.get('qop') + algorithm = chal.get('algorithm', 'MD5') + # mod_digest doesn't send an opaque, even though it isn't + # supposed to be optional + opaque = chal.get('opaque', None) + except KeyError: + return None + + H, KD = self.get_algorithm_impls(algorithm) + if H is None: + return None + + user, pw = self.passwd.find_user_password(realm, req.full_url) + if user is None: + return None + + # XXX not implemented yet + if req.data is not None: + entdig = self.get_entity_digest(req.data, chal) + else: + entdig = None + + A1 = "%s:%s:%s" % (user, realm, pw) + A2 = "%s:%s" % (req.get_method(), + # XXX selector: what about proxies and full urls + req.selector) + if qop == 'auth': + if nonce == self.last_nonce: + self.nonce_count += 1 + else: + self.nonce_count = 1 + self.last_nonce = nonce + ncvalue = '%08x' % self.nonce_count + cnonce = self.get_cnonce(nonce) + noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2)) + respdig = KD(H(A1), noncebit) + elif qop is None: + respdig = KD(H(A1), "%s:%s" % (nonce, H(A2))) + else: + # XXX handle auth-int. + raise URLError("qop '%s' is not supported." % qop) + + # XXX should the partial digests be encoded too? + + base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \ + 'response="%s"' % (user, realm, nonce, req.selector, + respdig) + if opaque: + base += ', opaque="%s"' % opaque + if entdig: + base += ', digest="%s"' % entdig + base += ', algorithm="%s"' % algorithm + if qop: + base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce) + return base + + def get_algorithm_impls(self, algorithm): + # lambdas assume digest modules are imported at the top level + if algorithm == 'MD5': + H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest() + elif algorithm == 'SHA': + H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest() + # XXX MD5-sess + else: + raise ValueError("Unsupported digest authentication " + "algorithm %r" % algorithm) + KD = lambda s, d: H("%s:%s" % (s, d)) + return H, KD + + def get_entity_digest(self, data, chal): + # XXX not implemented yet + return None + + +class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): + """An authentication protocol defined by RFC 2069 + + Digest authentication improves on basic authentication because it + does not transmit passwords in the clear. + """ + + auth_header = 'Authorization' + handler_order = 490 # before Basic auth + + def http_error_401(self, req, fp, code, msg, headers): + host = urlparse(req.full_url)[1] + retry = self.http_error_auth_reqed('www-authenticate', + host, req, headers) + self.reset_retry_count() + return retry + + +class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): + + auth_header = 'Proxy-Authorization' + handler_order = 490 # before Basic auth + + def http_error_407(self, req, fp, code, msg, headers): + host = req.host + retry = self.http_error_auth_reqed('proxy-authenticate', + host, req, headers) + self.reset_retry_count() + return retry + +class AbstractHTTPHandler(BaseHandler): + + def __init__(self, debuglevel=0): + self._debuglevel = debuglevel + + def set_http_debuglevel(self, level): + self._debuglevel = level + + def _get_content_length(self, request): + return http.client.HTTPConnection._get_content_length( + request.data, + request.get_method()) + + def do_request_(self, request): + host = request.host + if not host: + raise URLError('no host given') + + if request.data is not None: # POST + data = request.data + if isinstance(data, str): + msg = "POST data should be bytes, an iterable of bytes, " \ + "or a file object. It cannot be of type str." + raise TypeError(msg) + if not request.has_header('Content-type'): + request.add_unredirected_header( + 'Content-type', + 'application/x-www-form-urlencoded') + if (not request.has_header('Content-length') + and not request.has_header('Transfer-encoding')): + content_length = self._get_content_length(request) + if content_length is not None: + request.add_unredirected_header( + 'Content-length', str(content_length)) + else: + request.add_unredirected_header( + 'Transfer-encoding', 'chunked') + + sel_host = host + if request.has_proxy(): + scheme, sel = splittype(request.selector) + sel_host, sel_path = splithost(sel) + if not request.has_header('Host'): + request.add_unredirected_header('Host', sel_host) + for name, value in self.parent.addheaders: + name = name.capitalize() + if not request.has_header(name): + request.add_unredirected_header(name, value) + + return request + + def do_open(self, http_class, req, **http_conn_args): + """Return an HTTPResponse object for the request, using http_class. + + http_class must implement the HTTPConnection API from http.client. + """ + host = req.host + if not host: + raise URLError('no host given') + + # will parse host:port + h = http_class(host, timeout=req.timeout, **http_conn_args) + h.set_debuglevel(self._debuglevel) + + headers = dict(req.unredirected_hdrs) + headers.update(dict((k, v) for k, v in req.headers.items() + if k not in headers)) + + # TODO(jhylton): Should this be redesigned to handle + # persistent connections? + + # We want to make an HTTP/1.1 request, but the addinfourl + # class isn't prepared to deal with a persistent connection. + # It will try to read all remaining data from the socket, + # which will block while the server waits for the next request. + # So make sure the connection gets closed after the (only) + # request. + headers["Connection"] = "close" + headers = dict((name.title(), val) for name, val in headers.items()) + + if req._tunnel_host: + tunnel_headers = {} + proxy_auth_hdr = "Proxy-Authorization" + if proxy_auth_hdr in headers: + tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr] + # Proxy-Authorization should not be sent to origin + # server. + del headers[proxy_auth_hdr] + h.set_tunnel(req._tunnel_host, headers=tunnel_headers) + + try: + try: + h.request(req.get_method(), req.selector, req.data, headers, + encode_chunked=req.has_header('Transfer-encoding')) + except OSError as err: # timeout error + raise URLError(err) + r = h.getresponse() + except: + h.close() + raise + + # If the server does not send us a 'Connection: close' header, + # HTTPConnection assumes the socket should be left open. Manually + # mark the socket to be closed when this response object goes away. + if h.sock: + h.sock.close() + h.sock = None + + r.url = req.get_full_url() + # This line replaces the .msg attribute of the HTTPResponse + # with .headers, because urllib clients expect the response to + # have the reason in .msg. It would be good to mark this + # attribute is deprecated and get then to use info() or + # .headers. + r.msg = r.reason + return r + + +class HTTPHandler(AbstractHTTPHandler): + + def http_open(self, req): + return self.do_open(http.client.HTTPConnection, req) + + http_request = AbstractHTTPHandler.do_request_ + +if hasattr(http.client, 'HTTPSConnection'): + + class HTTPSHandler(AbstractHTTPHandler): + + def __init__(self, debuglevel=0, context=None, check_hostname=None): + AbstractHTTPHandler.__init__(self, debuglevel) + self._context = context + self._check_hostname = check_hostname + + def https_open(self, req): + return self.do_open(http.client.HTTPSConnection, req, + context=self._context, check_hostname=self._check_hostname) + + https_request = AbstractHTTPHandler.do_request_ + + __all__.append('HTTPSHandler') + +class HTTPCookieProcessor(BaseHandler): + def __init__(self, cookiejar=None): + import http.cookiejar + if cookiejar is None: + cookiejar = http.cookiejar.CookieJar() + self.cookiejar = cookiejar + + def http_request(self, request): + self.cookiejar.add_cookie_header(request) + return request + + def http_response(self, request, response): + self.cookiejar.extract_cookies(response, request) + return response + + https_request = http_request + https_response = http_response + +class UnknownHandler(BaseHandler): + def unknown_open(self, req): + type = req.type + raise URLError('unknown url type: %s' % type) + +def parse_keqv_list(l): + """Parse list of key=value strings where keys are not duplicated.""" + parsed = {} + for elt in l: + k, v = elt.split('=', 1) + if v[0] == '"' and v[-1] == '"': + v = v[1:-1] + parsed[k] = v + return parsed + +def parse_http_list(s): + """Parse lists as described by RFC 2068 Section 2. + + In particular, parse comma-separated lists where the elements of + the list may include quoted-strings. A quoted-string could + contain a comma. A non-quoted string could have quotes in the + middle. Neither commas nor quotes count if they are escaped. + Only double-quotes count, not single-quotes. + """ + res = [] + part = '' + + escape = quote = False + for cur in s: + if escape: + part += cur + escape = False + continue + if quote: + if cur == '\\': + escape = True + continue + elif cur == '"': + quote = False + part += cur + continue + + if cur == ',': + res.append(part) + part = '' + continue + + if cur == '"': + quote = True + + part += cur + + # append last part + if part: + res.append(part) + + return [part.strip() for part in res] + +class FileHandler(BaseHandler): + # Use local file or FTP depending on form of URL + def file_open(self, req): + url = req.selector + if url[:2] == '//' and url[2:3] != '/' and (req.host and + req.host != 'localhost'): + if not req.host in self.get_names(): + raise URLError("file:// scheme is supported only on localhost") + else: + return self.open_local_file(req) + + # names for the localhost + names = None + def get_names(self): + if FileHandler.names is None: + try: + FileHandler.names = tuple( + socket.gethostbyname_ex('localhost')[2] + + socket.gethostbyname_ex(socket.gethostname())[2]) + except socket.gaierror: + FileHandler.names = (socket.gethostbyname('localhost'),) + return FileHandler.names + + # not entirely sure what the rules are here + def open_local_file(self, req): + import email.utils + import mimetypes + host = req.host + filename = req.selector + localfile = url2pathname(filename) + try: + stats = os.stat(localfile) + size = stats.st_size + modified = email.utils.formatdate(stats.st_mtime, usegmt=True) + mtype = mimetypes.guess_type(filename)[0] + headers = email.message_from_string( + 'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' % + (mtype or 'text/plain', size, modified)) + if host: + host, port = splitport(host) + if not host or \ + (not port and _safe_gethostbyname(host) in self.get_names()): + if host: + origurl = 'file://' + host + filename + else: + origurl = 'file://' + filename + return addinfourl(open(localfile, 'rb'), headers, origurl) + except OSError as exp: + # users shouldn't expect OSErrors coming from urlopen() + raise URLError(exp) + raise URLError('file not on local host') + +def _safe_gethostbyname(host): + try: + return socket.gethostbyname(host) + except socket.gaierror: + return None + +class FTPHandler(BaseHandler): + def ftp_open(self, req): + import ftplib + import mimetypes + host = req.host + if not host: + raise URLError('ftp error: no host given') + host, port = splitport(host) + if port is None: + port = ftplib.FTP_PORT + else: + port = int(port) + + # username/password handling + user, host = splituser(host) + if user: + user, passwd = splitpasswd(user) + else: + passwd = None + host = unquote(host) + user = user or '' + passwd = passwd or '' + + try: + host = socket.gethostbyname(host) + except OSError as msg: + raise URLError(msg) + path, attrs = splitattr(req.selector) + dirs = path.split('/') + dirs = list(map(unquote, dirs)) + dirs, file = dirs[:-1], dirs[-1] + if dirs and not dirs[0]: + dirs = dirs[1:] + try: + fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout) + type = file and 'I' or 'D' + for attr in attrs: + attr, value = splitvalue(attr) + if attr.lower() == 'type' and \ + value in ('a', 'A', 'i', 'I', 'd', 'D'): + type = value.upper() + fp, retrlen = fw.retrfile(file, type) + headers = "" + mtype = mimetypes.guess_type(req.full_url)[0] + if mtype: + headers += "Content-type: %s\n" % mtype + if retrlen is not None and retrlen >= 0: + headers += "Content-length: %d\n" % retrlen + headers = email.message_from_string(headers) + return addinfourl(fp, headers, req.full_url) + except ftplib.all_errors as exp: + exc = URLError('ftp error: %r' % exp) + raise exc.with_traceback(sys.exc_info()[2]) + + def connect_ftp(self, user, passwd, host, port, dirs, timeout): + return ftpwrapper(user, passwd, host, port, dirs, timeout, + persistent=False) + +class CacheFTPHandler(FTPHandler): + # XXX would be nice to have pluggable cache strategies + # XXX this stuff is definitely not thread safe + def __init__(self): + self.cache = {} + self.timeout = {} + self.soonest = 0 + self.delay = 60 + self.max_conns = 16 + + def setTimeout(self, t): + self.delay = t + + def setMaxConns(self, m): + self.max_conns = m + + def connect_ftp(self, user, passwd, host, port, dirs, timeout): + key = user, host, port, '/'.join(dirs), timeout + if key in self.cache: + self.timeout[key] = time.time() + self.delay + else: + self.cache[key] = ftpwrapper(user, passwd, host, port, + dirs, timeout) + self.timeout[key] = time.time() + self.delay + self.check_cache() + return self.cache[key] + + def check_cache(self): + # first check for old ones + t = time.time() + if self.soonest <= t: + for k, v in list(self.timeout.items()): + if v < t: + self.cache[k].close() + del self.cache[k] + del self.timeout[k] + self.soonest = min(list(self.timeout.values())) + + # then check the size + if len(self.cache) == self.max_conns: + for k, v in list(self.timeout.items()): + if v == self.soonest: + del self.cache[k] + del self.timeout[k] + break + self.soonest = min(list(self.timeout.values())) + + def clear_cache(self): + for conn in self.cache.values(): + conn.close() + self.cache.clear() + self.timeout.clear() + +class DataHandler(BaseHandler): + def data_open(self, req): + # data URLs as specified in RFC 2397. + # + # ignores POSTed data + # + # syntax: + # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data + # mediatype := [ type "/" subtype ] *( ";" parameter ) + # data := *urlchar + # parameter := attribute "=" value + url = req.full_url + + scheme, data = url.split(":",1) + mediatype, data = data.split(",",1) + + # even base64 encoded data URLs might be quoted so unquote in any case: + data = unquote_to_bytes(data) + if mediatype.endswith(";base64"): + data = base64.decodebytes(data) + mediatype = mediatype[:-7] + + if not mediatype: + mediatype = "text/plain;charset=US-ASCII" + + headers = email.message_from_string("Content-type: %s\nContent-length: %d\n" % + (mediatype, len(data))) + + return addinfourl(io.BytesIO(data), headers, url) + + +# Code move from the old urllib module + +MAXFTPCACHE = 10 # Trim the ftp cache beyond this size + +# Helper for non-unix systems +if os.name == 'nt': + from nturl2path import url2pathname, pathname2url +else: + def url2pathname(pathname): + """OS-specific conversion from a relative URL of the 'file' scheme + to a file system path; not recommended for general use.""" + return unquote(pathname) + + def pathname2url(pathname): + """OS-specific conversion from a file system path to a relative URL + of the 'file' scheme; not recommended for general use.""" + return quote(pathname) + +# This really consists of two pieces: +# (1) a class which handles opening of all sorts of URLs +# (plus assorted utilities etc.) +# (2) a set of functions for parsing URLs +# XXX Should these be separated out into different modules? + + +ftpcache = {} +class URLopener: + """Class to open URLs. + This is a class rather than just a subroutine because we may need + more than one set of global protocol-specific options. + Note -- this is a base class for those who don't want the + automatic handling of errors type 302 (relocated) and 401 + (authorization needed).""" + + __tempfiles = None + + version = "Python-urllib/%s" % __version__ + + # Constructor + def __init__(self, proxies=None, **x509): + msg = "%(class)s style of invoking requests is deprecated. " \ + "Use newer urlopen functions/methods" % {'class': self.__class__.__name__} + warnings.warn(msg, DeprecationWarning, stacklevel=3) + if proxies is None: + proxies = getproxies() + assert hasattr(proxies, 'keys'), "proxies must be a mapping" + self.proxies = proxies + self.key_file = x509.get('key_file') + self.cert_file = x509.get('cert_file') + self.addheaders = [('User-Agent', self.version), ('Accept', '*/*')] + self.__tempfiles = [] + self.__unlink = os.unlink # See cleanup() + self.tempcache = None + # Undocumented feature: if you assign {} to tempcache, + # it is used to cache files retrieved with + # self.retrieve(). This is not enabled by default + # since it does not work for changing documents (and I + # haven't got the logic to check expiration headers + # yet). + self.ftpcache = ftpcache + # Undocumented feature: you can use a different + # ftp cache by assigning to the .ftpcache member; + # in case you want logically independent URL openers + # XXX This is not threadsafe. Bah. + + def __del__(self): + self.close() + + def close(self): + self.cleanup() + + def cleanup(self): + # This code sometimes runs when the rest of this module + # has already been deleted, so it can't use any globals + # or import anything. + if self.__tempfiles: + for file in self.__tempfiles: + try: + self.__unlink(file) + except OSError: + pass + del self.__tempfiles[:] + if self.tempcache: + self.tempcache.clear() + + def addheader(self, *args): + """Add a header to be used by the HTTP interface only + e.g. u.addheader('Accept', 'sound/basic')""" + self.addheaders.append(args) + + # External interface + def open(self, fullurl, data=None): + """Use URLopener().open(file) instead of open(file, 'r').""" + fullurl = unwrap(to_bytes(fullurl)) + fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|") + if self.tempcache and fullurl in self.tempcache: + filename, headers = self.tempcache[fullurl] + fp = open(filename, 'rb') + return addinfourl(fp, headers, fullurl) + urltype, url = splittype(fullurl) + if not urltype: + urltype = 'file' + if urltype in self.proxies: + proxy = self.proxies[urltype] + urltype, proxyhost = splittype(proxy) + host, selector = splithost(proxyhost) + url = (host, fullurl) # Signal special case to open_*() + else: + proxy = None + name = 'open_' + urltype + self.type = urltype + name = name.replace('-', '_') + if not hasattr(self, name): + if proxy: + return self.open_unknown_proxy(proxy, fullurl, data) + else: + return self.open_unknown(fullurl, data) + try: + if data is None: + return getattr(self, name)(url) + else: + return getattr(self, name)(url, data) + except (HTTPError, URLError): + raise + except OSError as msg: + raise OSError('socket error', msg).with_traceback(sys.exc_info()[2]) + + def open_unknown(self, fullurl, data=None): + """Overridable interface to open unknown URL type.""" + type, url = splittype(fullurl) + raise OSError('url error', 'unknown url type', type) + + def open_unknown_proxy(self, proxy, fullurl, data=None): + """Overridable interface to open unknown URL type.""" + type, url = splittype(fullurl) + raise OSError('url error', 'invalid proxy for %s' % type, proxy) + + # External interface + def retrieve(self, url, filename=None, reporthook=None, data=None): + """retrieve(url) returns (filename, headers) for a local object + or (tempfilename, headers) for a remote object.""" + url = unwrap(to_bytes(url)) + if self.tempcache and url in self.tempcache: + return self.tempcache[url] + type, url1 = splittype(url) + if filename is None and (not type or type == 'file'): + try: + fp = self.open_local_file(url1) + hdrs = fp.info() + fp.close() + return url2pathname(splithost(url1)[1]), hdrs + except OSError as msg: + pass + fp = self.open(url, data) + try: + headers = fp.info() + if filename: + tfp = open(filename, 'wb') + else: + import tempfile + garbage, path = splittype(url) + garbage, path = splithost(path or "") + path, garbage = splitquery(path or "") + path, garbage = splitattr(path or "") + suffix = os.path.splitext(path)[1] + (fd, filename) = tempfile.mkstemp(suffix) + self.__tempfiles.append(filename) + tfp = os.fdopen(fd, 'wb') + try: + result = filename, headers + if self.tempcache is not None: + self.tempcache[url] = result + bs = 1024*8 + size = -1 + read = 0 + blocknum = 0 + if "content-length" in headers: + size = int(headers["Content-Length"]) + if reporthook: + reporthook(blocknum, bs, size) + while 1: + block = fp.read(bs) + if not block: + break + read += len(block) + tfp.write(block) + blocknum += 1 + if reporthook: + reporthook(blocknum, bs, size) + finally: + tfp.close() + finally: + fp.close() + + # raise exception if actual size does not match content-length header + if size >= 0 and read < size: + raise ContentTooShortError( + "retrieval incomplete: got only %i out of %i bytes" + % (read, size), result) + + return result + + # Each method named open_ knows how to open that type of URL + + def _open_generic_http(self, connection_factory, url, data): + """Make an HTTP connection using connection_class. + + This is an internal method that should be called from + open_http() or open_https(). + + Arguments: + - connection_factory should take a host name and return an + HTTPConnection instance. + - url is the url to retrieval or a host, relative-path pair. + - data is payload for a POST request or None. + """ + + user_passwd = None + proxy_passwd= None + if isinstance(url, str): + host, selector = splithost(url) + if host: + user_passwd, host = splituser(host) + host = unquote(host) + realhost = host + else: + host, selector = url + # check whether the proxy contains authorization information + proxy_passwd, host = splituser(host) + # now we proceed with the url we want to obtain + urltype, rest = splittype(selector) + url = rest + user_passwd = None + if urltype.lower() != 'http': + realhost = None + else: + realhost, rest = splithost(rest) + if realhost: + user_passwd, realhost = splituser(realhost) + if user_passwd: + selector = "%s://%s%s" % (urltype, realhost, rest) + if proxy_bypass(realhost): + host = realhost + + if not host: raise OSError('http error', 'no host given') + + if proxy_passwd: + proxy_passwd = unquote(proxy_passwd) + proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii') + else: + proxy_auth = None + + if user_passwd: + user_passwd = unquote(user_passwd) + auth = base64.b64encode(user_passwd.encode()).decode('ascii') + else: + auth = None + http_conn = connection_factory(host) + headers = {} + if proxy_auth: + headers["Proxy-Authorization"] = "Basic %s" % proxy_auth + if auth: + headers["Authorization"] = "Basic %s" % auth + if realhost: + headers["Host"] = realhost + + # Add Connection:close as we don't support persistent connections yet. + # This helps in closing the socket and avoiding ResourceWarning + + headers["Connection"] = "close" + + for header, value in self.addheaders: + headers[header] = value + + if data is not None: + headers["Content-Type"] = "application/x-www-form-urlencoded" + http_conn.request("POST", selector, data, headers) + else: + http_conn.request("GET", selector, headers=headers) + + try: + response = http_conn.getresponse() + except http.client.BadStatusLine: + # something went wrong with the HTTP status line + raise URLError("http protocol error: bad status line") + + # According to RFC 2616, "2xx" code indicates that the client's + # request was successfully received, understood, and accepted. + if 200 <= response.status < 300: + return addinfourl(response, response.msg, "http:" + url, + response.status) + else: + return self.http_error( + url, response.fp, + response.status, response.reason, response.msg, data) + + def open_http(self, url, data=None): + """Use HTTP protocol.""" + return self._open_generic_http(http.client.HTTPConnection, url, data) + + def http_error(self, url, fp, errcode, errmsg, headers, data=None): + """Handle http errors. + + Derived class can override this, or provide specific handlers + named http_error_DDD where DDD is the 3-digit error code.""" + # First check if there's a specific handler for this error + name = 'http_error_%d' % errcode + if hasattr(self, name): + method = getattr(self, name) + if data is None: + result = method(url, fp, errcode, errmsg, headers) + else: + result = method(url, fp, errcode, errmsg, headers, data) + if result: return result + return self.http_error_default(url, fp, errcode, errmsg, headers) + + def http_error_default(self, url, fp, errcode, errmsg, headers): + """Default error handler: close the connection and raise OSError.""" + fp.close() + raise HTTPError(url, errcode, errmsg, headers, None) + + if _have_ssl: + def _https_connection(self, host): + return http.client.HTTPSConnection(host, + key_file=self.key_file, + cert_file=self.cert_file) + + def open_https(self, url, data=None): + """Use HTTPS protocol.""" + return self._open_generic_http(self._https_connection, url, data) + + def open_file(self, url): + """Use local file or FTP depending on form of URL.""" + if not isinstance(url, str): + raise URLError('file error: proxy support for file protocol currently not implemented') + if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/': + raise ValueError("file:// scheme is supported only on localhost") + else: + return self.open_local_file(url) + + def open_local_file(self, url): + """Use local file.""" + import email.utils + import mimetypes + host, file = splithost(url) + localname = url2pathname(file) + try: + stats = os.stat(localname) + except OSError as e: + raise URLError(e.strerror, e.filename) + size = stats.st_size + modified = email.utils.formatdate(stats.st_mtime, usegmt=True) + mtype = mimetypes.guess_type(url)[0] + headers = email.message_from_string( + 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' % + (mtype or 'text/plain', size, modified)) + if not host: + urlfile = file + if file[:1] == '/': + urlfile = 'file://' + file + return addinfourl(open(localname, 'rb'), headers, urlfile) + host, port = splitport(host) + if (not port + and socket.gethostbyname(host) in ((localhost(),) + thishost())): + urlfile = file + if file[:1] == '/': + urlfile = 'file://' + file + elif file[:2] == './': + raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url) + return addinfourl(open(localname, 'rb'), headers, urlfile) + raise URLError('local file error: not on local host') + + def open_ftp(self, url): + """Use FTP protocol.""" + if not isinstance(url, str): + raise URLError('ftp error: proxy support for ftp protocol currently not implemented') + import mimetypes + host, path = splithost(url) + if not host: raise URLError('ftp error: no host given') + host, port = splitport(host) + user, host = splituser(host) + if user: user, passwd = splitpasswd(user) + else: passwd = None + host = unquote(host) + user = unquote(user or '') + passwd = unquote(passwd or '') + host = socket.gethostbyname(host) + if not port: + import ftplib + port = ftplib.FTP_PORT + else: + port = int(port) + path, attrs = splitattr(path) + path = unquote(path) + dirs = path.split('/') + dirs, file = dirs[:-1], dirs[-1] + if dirs and not dirs[0]: dirs = dirs[1:] + if dirs and not dirs[0]: dirs[0] = '/' + key = user, host, port, '/'.join(dirs) + # XXX thread unsafe! + if len(self.ftpcache) > MAXFTPCACHE: + # Prune the cache, rather arbitrarily + for k in list(self.ftpcache): + if k != key: + v = self.ftpcache[k] + del self.ftpcache[k] + v.close() + try: + if key not in self.ftpcache: + self.ftpcache[key] = \ + ftpwrapper(user, passwd, host, port, dirs) + if not file: type = 'D' + else: type = 'I' + for attr in attrs: + attr, value = splitvalue(attr) + if attr.lower() == 'type' and \ + value in ('a', 'A', 'i', 'I', 'd', 'D'): + type = value.upper() + (fp, retrlen) = self.ftpcache[key].retrfile(file, type) + mtype = mimetypes.guess_type("ftp:" + url)[0] + headers = "" + if mtype: + headers += "Content-Type: %s\n" % mtype + if retrlen is not None and retrlen >= 0: + headers += "Content-Length: %d\n" % retrlen + headers = email.message_from_string(headers) + return addinfourl(fp, headers, "ftp:" + url) + except ftperrors() as exp: + raise URLError('ftp error %r' % exp).with_traceback(sys.exc_info()[2]) + + def open_data(self, url, data=None): + """Use "data" URL.""" + if not isinstance(url, str): + raise URLError('data error: proxy support for data protocol currently not implemented') + # ignore POSTed data + # + # syntax of data URLs: + # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data + # mediatype := [ type "/" subtype ] *( ";" parameter ) + # data := *urlchar + # parameter := attribute "=" value + try: + [type, data] = url.split(',', 1) + except ValueError: + raise OSError('data error', 'bad data URL') + if not type: + type = 'text/plain;charset=US-ASCII' + semi = type.rfind(';') + if semi >= 0 and '=' not in type[semi:]: + encoding = type[semi+1:] + type = type[:semi] + else: + encoding = '' + msg = [] + msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT', + time.gmtime(time.time()))) + msg.append('Content-type: %s' % type) + if encoding == 'base64': + # XXX is this encoding/decoding ok? + data = base64.decodebytes(data.encode('ascii')).decode('latin-1') + else: + data = unquote(data) + msg.append('Content-Length: %d' % len(data)) + msg.append('') + msg.append(data) + msg = '\n'.join(msg) + headers = email.message_from_string(msg) + f = io.StringIO(msg) + #f.fileno = None # needed for addinfourl + return addinfourl(f, headers, url) + + +class FancyURLopener(URLopener): + """Derived class with handlers for errors we can handle (perhaps).""" + + def __init__(self, *args, **kwargs): + URLopener.__init__(self, *args, **kwargs) + self.auth_cache = {} + self.tries = 0 + self.maxtries = 10 + + def http_error_default(self, url, fp, errcode, errmsg, headers): + """Default error handling -- don't raise an exception.""" + return addinfourl(fp, headers, "http:" + url, errcode) + + def http_error_302(self, url, fp, errcode, errmsg, headers, data=None): + """Error 302 -- relocated (temporarily).""" + self.tries += 1 + try: + if self.maxtries and self.tries >= self.maxtries: + if hasattr(self, "http_error_500"): + meth = self.http_error_500 + else: + meth = self.http_error_default + return meth(url, fp, 500, + "Internal Server Error: Redirect Recursion", + headers) + result = self.redirect_internal(url, fp, errcode, errmsg, + headers, data) + return result + finally: + self.tries = 0 + + def redirect_internal(self, url, fp, errcode, errmsg, headers, data): + if 'location' in headers: + newurl = headers['location'] + elif 'uri' in headers: + newurl = headers['uri'] + else: + return + fp.close() + + # In case the server sent a relative URL, join with original: + newurl = urljoin(self.type + ":" + url, newurl) + + urlparts = urlparse(newurl) + + # For security reasons, we don't allow redirection to anything other + # than http, https and ftp. + + # We are using newer HTTPError with older redirect_internal method + # This older method will get deprecated in 3.3 + + if urlparts.scheme not in ('http', 'https', 'ftp', ''): + raise HTTPError(newurl, errcode, + errmsg + + " Redirection to url '%s' is not allowed." % newurl, + headers, fp) + + return self.open(newurl) + + def http_error_301(self, url, fp, errcode, errmsg, headers, data=None): + """Error 301 -- also relocated (permanently).""" + return self.http_error_302(url, fp, errcode, errmsg, headers, data) + + def http_error_303(self, url, fp, errcode, errmsg, headers, data=None): + """Error 303 -- also relocated (essentially identical to 302).""" + return self.http_error_302(url, fp, errcode, errmsg, headers, data) + + def http_error_307(self, url, fp, errcode, errmsg, headers, data=None): + """Error 307 -- relocated, but turn POST into error.""" + if data is None: + return self.http_error_302(url, fp, errcode, errmsg, headers, data) + else: + return self.http_error_default(url, fp, errcode, errmsg, headers) + + def http_error_401(self, url, fp, errcode, errmsg, headers, data=None, + retry=False): + """Error 401 -- authentication required. + This function supports Basic authentication only.""" + if 'www-authenticate' not in headers: + URLopener.http_error_default(self, url, fp, + errcode, errmsg, headers) + stuff = headers['www-authenticate'] + match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff) + if not match: + URLopener.http_error_default(self, url, fp, + errcode, errmsg, headers) + scheme, realm = match.groups() + if scheme.lower() != 'basic': + URLopener.http_error_default(self, url, fp, + errcode, errmsg, headers) + if not retry: + URLopener.http_error_default(self, url, fp, errcode, errmsg, + headers) + name = 'retry_' + self.type + '_basic_auth' + if data is None: + return getattr(self,name)(url, realm) + else: + return getattr(self,name)(url, realm, data) + + def http_error_407(self, url, fp, errcode, errmsg, headers, data=None, + retry=False): + """Error 407 -- proxy authentication required. + This function supports Basic authentication only.""" + if 'proxy-authenticate' not in headers: + URLopener.http_error_default(self, url, fp, + errcode, errmsg, headers) + stuff = headers['proxy-authenticate'] + match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff) + if not match: + URLopener.http_error_default(self, url, fp, + errcode, errmsg, headers) + scheme, realm = match.groups() + if scheme.lower() != 'basic': + URLopener.http_error_default(self, url, fp, + errcode, errmsg, headers) + if not retry: + URLopener.http_error_default(self, url, fp, errcode, errmsg, + headers) + name = 'retry_proxy_' + self.type + '_basic_auth' + if data is None: + return getattr(self,name)(url, realm) + else: + return getattr(self,name)(url, realm, data) + + def retry_proxy_http_basic_auth(self, url, realm, data=None): + host, selector = splithost(url) + newurl = 'http://' + host + selector + proxy = self.proxies['http'] + urltype, proxyhost = splittype(proxy) + proxyhost, proxyselector = splithost(proxyhost) + i = proxyhost.find('@') + 1 + proxyhost = proxyhost[i:] + user, passwd = self.get_user_passwd(proxyhost, realm, i) + if not (user or passwd): return None + proxyhost = "%s:%s@%s" % (quote(user, safe=''), + quote(passwd, safe=''), proxyhost) + self.proxies['http'] = 'http://' + proxyhost + proxyselector + if data is None: + return self.open(newurl) + else: + return self.open(newurl, data) + + def retry_proxy_https_basic_auth(self, url, realm, data=None): + host, selector = splithost(url) + newurl = 'https://' + host + selector + proxy = self.proxies['https'] + urltype, proxyhost = splittype(proxy) + proxyhost, proxyselector = splithost(proxyhost) + i = proxyhost.find('@') + 1 + proxyhost = proxyhost[i:] + user, passwd = self.get_user_passwd(proxyhost, realm, i) + if not (user or passwd): return None + proxyhost = "%s:%s@%s" % (quote(user, safe=''), + quote(passwd, safe=''), proxyhost) + self.proxies['https'] = 'https://' + proxyhost + proxyselector + if data is None: + return self.open(newurl) + else: + return self.open(newurl, data) + + def retry_http_basic_auth(self, url, realm, data=None): + host, selector = splithost(url) + i = host.find('@') + 1 + host = host[i:] + user, passwd = self.get_user_passwd(host, realm, i) + if not (user or passwd): return None + host = "%s:%s@%s" % (quote(user, safe=''), + quote(passwd, safe=''), host) + newurl = 'http://' + host + selector + if data is None: + return self.open(newurl) + else: + return self.open(newurl, data) + + def retry_https_basic_auth(self, url, realm, data=None): + host, selector = splithost(url) + i = host.find('@') + 1 + host = host[i:] + user, passwd = self.get_user_passwd(host, realm, i) + if not (user or passwd): return None + host = "%s:%s@%s" % (quote(user, safe=''), + quote(passwd, safe=''), host) + newurl = 'https://' + host + selector + if data is None: + return self.open(newurl) + else: + return self.open(newurl, data) + + def get_user_passwd(self, host, realm, clear_cache=0): + key = realm + '@' + host.lower() + if key in self.auth_cache: + if clear_cache: + del self.auth_cache[key] + else: + return self.auth_cache[key] + user, passwd = self.prompt_user_passwd(host, realm) + if user or passwd: self.auth_cache[key] = (user, passwd) + return user, passwd + + def prompt_user_passwd(self, host, realm): + """Override this in a GUI environment!""" + import getpass + try: + user = input("Enter username for %s at %s: " % (realm, host)) + passwd = getpass.getpass("Enter password for %s in %s at %s: " % + (user, realm, host)) + return user, passwd + except KeyboardInterrupt: + print() + return None, None + + +# Utility functions + +_localhost = None +def localhost(): + """Return the IP address of the magic hostname 'localhost'.""" + global _localhost + if _localhost is None: + _localhost = socket.gethostbyname('localhost') + return _localhost + +_thishost = None +def thishost(): + """Return the IP addresses of the current host.""" + global _thishost + if _thishost is None: + try: + _thishost = tuple(socket.gethostbyname_ex(socket.gethostname())[2]) + except socket.gaierror: + _thishost = tuple(socket.gethostbyname_ex('localhost')[2]) + return _thishost + +_ftperrors = None +def ftperrors(): + """Return the set of errors raised by the FTP class.""" + global _ftperrors + if _ftperrors is None: + import ftplib + _ftperrors = ftplib.all_errors + return _ftperrors + +_noheaders = None +def noheaders(): + """Return an empty email Message object.""" + global _noheaders + if _noheaders is None: + _noheaders = email.message_from_string("") + return _noheaders + + +# Utility classes + +class ftpwrapper: + """Class used by open_ftp() for cache of open FTP connections.""" + + def __init__(self, user, passwd, host, port, dirs, timeout=None, + persistent=True): + self.user = user + self.passwd = passwd + self.host = host + self.port = port + self.dirs = dirs + self.timeout = timeout + self.refcount = 0 + self.keepalive = persistent + try: + self.init() + except: + self.close() + raise + + def init(self): + import ftplib + self.busy = 0 + self.ftp = ftplib.FTP() + self.ftp.connect(self.host, self.port, self.timeout) + self.ftp.login(self.user, self.passwd) + _target = '/'.join(self.dirs) + self.ftp.cwd(_target) + + def retrfile(self, file, type): + import ftplib + self.endtransfer() + if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1 + else: cmd = 'TYPE ' + type; isdir = 0 + try: + self.ftp.voidcmd(cmd) + except ftplib.all_errors: + self.init() + self.ftp.voidcmd(cmd) + conn = None + if file and not isdir: + # Try to retrieve as a file + try: + cmd = 'RETR ' + file + conn, retrlen = self.ftp.ntransfercmd(cmd) + except ftplib.error_perm as reason: + if str(reason)[:3] != '550': + raise URLError('ftp error: %r' % reason).with_traceback( + sys.exc_info()[2]) + if not conn: + # Set transfer mode to ASCII! + self.ftp.voidcmd('TYPE A') + # Try a directory listing. Verify that directory exists. + if file: + pwd = self.ftp.pwd() + try: + try: + self.ftp.cwd(file) + except ftplib.error_perm as reason: + raise URLError('ftp error: %r' % reason) from reason + finally: + self.ftp.cwd(pwd) + cmd = 'LIST ' + file + else: + cmd = 'LIST' + conn, retrlen = self.ftp.ntransfercmd(cmd) + self.busy = 1 + + ftpobj = addclosehook(conn.makefile('rb'), self.file_close) + self.refcount += 1 + conn.close() + # Pass back both a suitably decorated object and a retrieval length + return (ftpobj, retrlen) + + def endtransfer(self): + self.busy = 0 + + def close(self): + self.keepalive = False + if self.refcount <= 0: + self.real_close() + + def file_close(self): + self.endtransfer() + self.refcount -= 1 + if self.refcount <= 0 and not self.keepalive: + self.real_close() + + def real_close(self): + self.endtransfer() + try: + self.ftp.close() + except ftperrors(): + pass + +# Proxy handling +def getproxies_environment(): + """Return a dictionary of scheme -> proxy server URL mappings. + + Scan the environment for variables named _proxy; + this seems to be the standard convention. If you need a + different way, you can pass a proxies dictionary to the + [Fancy]URLopener constructor. + + """ + proxies = {} + # in order to prefer lowercase variables, process environment in + # two passes: first matches any, second pass matches lowercase only + for name, value in os.environ.items(): + name = name.lower() + if value and name[-6:] == '_proxy': + proxies[name[:-6]] = value + # CVE-2016-1000110 - If we are running as CGI script, forget HTTP_PROXY + # (non-all-lowercase) as it may be set from the web server by a "Proxy:" + # header from the client + # If "proxy" is lowercase, it will still be used thanks to the next block + if 'REQUEST_METHOD' in os.environ: + proxies.pop('http', None) + for name, value in os.environ.items(): + if name[-6:] == '_proxy': + name = name.lower() + if value: + proxies[name[:-6]] = value + else: + proxies.pop(name[:-6], None) + return proxies + +def proxy_bypass_environment(host, proxies=None): + """Test if proxies should not be used for a particular host. + + Checks the proxy dict for the value of no_proxy, which should + be a list of comma separated DNS suffixes, or '*' for all hosts. + + """ + if proxies is None: + proxies = getproxies_environment() + # don't bypass, if no_proxy isn't specified + try: + no_proxy = proxies['no'] + except KeyError: + return 0 + # '*' is special case for always bypass + if no_proxy == '*': + return 1 + # strip port off host + hostonly, port = splitport(host) + # check if the host ends with any of the DNS suffixes + no_proxy_list = [proxy.strip() for proxy in no_proxy.split(',')] + for name in no_proxy_list: + if name: + name = re.escape(name) + pattern = r'(.+\.)?%s$' % name + if (re.match(pattern, hostonly, re.I) + or re.match(pattern, host, re.I)): + return 1 + # otherwise, don't bypass + return 0 + + +# This code tests an OSX specific data structure but is testable on all +# platforms +def _proxy_bypass_macosx_sysconf(host, proxy_settings): + """ + Return True iff this host shouldn't be accessed using a proxy + + This function uses the MacOSX framework SystemConfiguration + to fetch the proxy information. + + proxy_settings come from _scproxy._get_proxy_settings or get mocked ie: + { 'exclude_simple': bool, + 'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16'] + } + """ + from fnmatch import fnmatch + + hostonly, port = splitport(host) + + def ip2num(ipAddr): + parts = ipAddr.split('.') + parts = list(map(int, parts)) + if len(parts) != 4: + parts = (parts + [0, 0, 0, 0])[:4] + return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3] + + # Check for simple host names: + if '.' not in host: + if proxy_settings['exclude_simple']: + return True + + hostIP = None + + for value in proxy_settings.get('exceptions', ()): + # Items in the list are strings like these: *.local, 169.254/16 + if not value: continue + + m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value) + if m is not None: + if hostIP is None: + try: + hostIP = socket.gethostbyname(hostonly) + hostIP = ip2num(hostIP) + except OSError: + continue + + base = ip2num(m.group(1)) + mask = m.group(2) + if mask is None: + mask = 8 * (m.group(1).count('.') + 1) + else: + mask = int(mask[1:]) + mask = 32 - mask + + if (hostIP >> mask) == (base >> mask): + return True + + elif fnmatch(host, value): + return True + + return False + + +if sys.platform == 'darwin': + from _scproxy import _get_proxy_settings, _get_proxies + + def proxy_bypass_macosx_sysconf(host): + proxy_settings = _get_proxy_settings() + return _proxy_bypass_macosx_sysconf(host, proxy_settings) + + def getproxies_macosx_sysconf(): + """Return a dictionary of scheme -> proxy server URL mappings. + + This function uses the MacOSX framework SystemConfiguration + to fetch the proxy information. + """ + return _get_proxies() + + + + def proxy_bypass(host): + """Return True, if host should be bypassed. + + Checks proxy settings gathered from the environment, if specified, + or from the MacOSX framework SystemConfiguration. + + """ + proxies = getproxies_environment() + if proxies: + return proxy_bypass_environment(host, proxies) + else: + return proxy_bypass_macosx_sysconf(host) + + def getproxies(): + return getproxies_environment() or getproxies_macosx_sysconf() + + +elif os.name == 'nt': + def getproxies_registry(): + """Return a dictionary of scheme -> proxy server URL mappings. + + Win32 uses the registry to store proxies. + + """ + proxies = {} + try: + import winreg + except ImportError: + # Std module, so should be around - but you never know! + return proxies + try: + internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER, + r'Software\Microsoft\Windows\CurrentVersion\Internet Settings') + proxyEnable = winreg.QueryValueEx(internetSettings, + 'ProxyEnable')[0] + if proxyEnable: + # Returned as Unicode but problems if not converted to ASCII + proxyServer = str(winreg.QueryValueEx(internetSettings, + 'ProxyServer')[0]) + if '=' in proxyServer: + # Per-protocol settings + for p in proxyServer.split(';'): + protocol, address = p.split('=', 1) + # See if address has a type:// prefix + if not re.match('^([^/:]+)://', address): + address = '%s://%s' % (protocol, address) + proxies[protocol] = address + else: + # Use one setting for all protocols + if proxyServer[:5] == 'http:': + proxies['http'] = proxyServer + else: + proxies['http'] = 'http://%s' % proxyServer + proxies['https'] = 'https://%s' % proxyServer + proxies['ftp'] = 'ftp://%s' % proxyServer + internetSettings.Close() + except (OSError, ValueError, TypeError): + # Either registry key not found etc, or the value in an + # unexpected format. + # proxies already set up to be empty so nothing to do + pass + return proxies + + def getproxies(): + """Return a dictionary of scheme -> proxy server URL mappings. + + Returns settings gathered from the environment, if specified, + or the registry. + + """ + return getproxies_environment() or getproxies_registry() + + def proxy_bypass_registry(host): + try: + import winreg + except ImportError: + # Std modules, so should be around - but you never know! + return 0 + try: + internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER, + r'Software\Microsoft\Windows\CurrentVersion\Internet Settings') + proxyEnable = winreg.QueryValueEx(internetSettings, + 'ProxyEnable')[0] + proxyOverride = str(winreg.QueryValueEx(internetSettings, + 'ProxyOverride')[0]) + # ^^^^ Returned as Unicode but problems if not converted to ASCII + except OSError: + return 0 + if not proxyEnable or not proxyOverride: + return 0 + # try to make a host list from name and IP address. + rawHost, port = splitport(host) + host = [rawHost] + try: + addr = socket.gethostbyname(rawHost) + if addr != rawHost: + host.append(addr) + except OSError: + pass + try: + fqdn = socket.getfqdn(rawHost) + if fqdn != rawHost: + host.append(fqdn) + except OSError: + pass + # make a check value list from the registry entry: replace the + # '' string by the localhost entry and the corresponding + # canonical entry. + proxyOverride = proxyOverride.split(';') + # now check if we match one of the registry values. + for test in proxyOverride: + if test == '': + if '.' not in rawHost: + return 1 + test = test.replace(".", r"\.") # mask dots + test = test.replace("*", r".*") # change glob sequence + test = test.replace("?", r".") # change glob char + for val in host: + if re.match(test, val, re.I): + return 1 + return 0 + + def proxy_bypass(host): + """Return True, if host should be bypassed. + + Checks proxy settings gathered from the environment, if specified, + or the registry. + + """ + proxies = getproxies_environment() + if proxies: + return proxy_bypass_environment(host, proxies) + else: + return proxy_bypass_registry(host) + +else: + # By default use environment variables + getproxies = getproxies_environment + proxy_bypass = proxy_bypass_environment diff --git a/Lib/urllib/response.py b/Lib/urllib/response.py new file mode 100644 index 0000000000..4778118dbb --- /dev/null +++ b/Lib/urllib/response.py @@ -0,0 +1,80 @@ +"""Response classes used by urllib. + +The base class, addbase, defines a minimal file-like interface, +including read() and readline(). The typical response object is an +addinfourl instance, which defines an info() method that returns +headers and a geturl() method that returns the url. +""" + +import tempfile + +__all__ = ['addbase', 'addclosehook', 'addinfo', 'addinfourl'] + + +class addbase(tempfile._TemporaryFileWrapper): + """Base class for addinfo and addclosehook. Is a good idea for garbage collection.""" + + # XXX Add a method to expose the timeout on the underlying socket? + + def __init__(self, fp): + super(addbase, self).__init__(fp, '', delete=False) + # Keep reference around as this was part of the original API. + self.fp = fp + + def __repr__(self): + return '<%s at %r whose fp = %r>' % (self.__class__.__name__, + id(self), self.file) + + def __enter__(self): + if self.fp.closed: + raise ValueError("I/O operation on closed file") + return self + + def __exit__(self, type, value, traceback): + self.close() + + +class addclosehook(addbase): + """Class to add a close hook to an open file.""" + + def __init__(self, fp, closehook, *hookargs): + super(addclosehook, self).__init__(fp) + self.closehook = closehook + self.hookargs = hookargs + + def close(self): + try: + closehook = self.closehook + hookargs = self.hookargs + if closehook: + self.closehook = None + self.hookargs = None + closehook(*hookargs) + finally: + super(addclosehook, self).close() + + +class addinfo(addbase): + """class to add an info() method to an open file.""" + + def __init__(self, fp, headers): + super(addinfo, self).__init__(fp) + self.headers = headers + + def info(self): + return self.headers + + +class addinfourl(addinfo): + """class to add info() and geturl() methods to an open file.""" + + def __init__(self, fp, headers, url, code=None): + super(addinfourl, self).__init__(fp, headers) + self.url = url + self.code = code + + def getcode(self): + return self.code + + def geturl(self): + return self.url diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py new file mode 100644 index 0000000000..9dab4c1c3a --- /dev/null +++ b/Lib/urllib/robotparser.py @@ -0,0 +1,251 @@ +""" robotparser.py + + Copyright (C) 2000 Bastian Kleineidam + + You can choose between two licenses when using this package: + 1) GNU GPLv2 + 2) PSF license for Python 2.2 + + The robots.txt Exclusion Protocol is implemented as specified in + http://www.robotstxt.org/norobots-rfc.txt +""" + +import collections +import urllib.parse +import urllib.request + +__all__ = ["RobotFileParser"] + +class RobotFileParser: + """ This class provides a set of methods to read, parse and answer + questions about a single robots.txt file. + + """ + + def __init__(self, url=''): + self.entries = [] + self.default_entry = None + self.disallow_all = False + self.allow_all = False + self.set_url(url) + self.last_checked = 0 + + def mtime(self): + """Returns the time the robots.txt file was last fetched. + + This is useful for long-running web spiders that need to + check for new robots.txt files periodically. + + """ + return self.last_checked + + def modified(self): + """Sets the time the robots.txt file was last fetched to the + current time. + + """ + import time + self.last_checked = time.time() + + def set_url(self, url): + """Sets the URL referring to a robots.txt file.""" + self.url = url + self.host, self.path = urllib.parse.urlparse(url)[1:3] + + def read(self): + """Reads the robots.txt URL and feeds it to the parser.""" + try: + f = urllib.request.urlopen(self.url) + except urllib.error.HTTPError as err: + if err.code in (401, 403): + self.disallow_all = True + elif err.code >= 400 and err.code < 500: + self.allow_all = True + else: + raw = f.read() + self.parse(raw.decode("utf-8").splitlines()) + + def _add_entry(self, entry): + if "*" in entry.useragents: + # the default entry is considered last + if self.default_entry is None: + # the first default entry wins + self.default_entry = entry + else: + self.entries.append(entry) + + def parse(self, lines): + """Parse the input lines from a robots.txt file. + + We allow that a user-agent: line is not preceded by + one or more blank lines. + """ + # states: + # 0: start state + # 1: saw user-agent line + # 2: saw an allow or disallow line + state = 0 + entry = Entry() + + self.modified() + for line in lines: + if not line: + if state == 1: + entry = Entry() + state = 0 + elif state == 2: + self._add_entry(entry) + entry = Entry() + state = 0 + # remove optional comment and strip line + i = line.find('#') + if i >= 0: + line = line[:i] + line = line.strip() + if not line: + continue + line = line.split(':', 1) + if len(line) == 2: + line[0] = line[0].strip().lower() + line[1] = urllib.parse.unquote(line[1].strip()) + if line[0] == "user-agent": + if state == 2: + self._add_entry(entry) + entry = Entry() + entry.useragents.append(line[1]) + state = 1 + elif line[0] == "disallow": + if state != 0: + entry.rulelines.append(RuleLine(line[1], False)) + state = 2 + elif line[0] == "allow": + if state != 0: + entry.rulelines.append(RuleLine(line[1], True)) + state = 2 + elif line[0] == "crawl-delay": + if state != 0: + # before trying to convert to int we need to make + # sure that robots.txt has valid syntax otherwise + # it will crash + if line[1].strip().isdigit(): + entry.delay = int(line[1]) + state = 2 + elif line[0] == "request-rate": + if state != 0: + numbers = line[1].split('/') + # check if all values are sane + if (len(numbers) == 2 and numbers[0].strip().isdigit() + and numbers[1].strip().isdigit()): + req_rate = collections.namedtuple('req_rate', + 'requests seconds') + entry.req_rate = req_rate + entry.req_rate.requests = int(numbers[0]) + entry.req_rate.seconds = int(numbers[1]) + state = 2 + if state == 2: + self._add_entry(entry) + + def can_fetch(self, useragent, url): + """using the parsed robots.txt decide if useragent can fetch url""" + if self.disallow_all: + return False + if self.allow_all: + return True + # Until the robots.txt file has been read or found not + # to exist, we must assume that no url is allowable. + # This prevents false positives when a user erroneously + # calls can_fetch() before calling read(). + if not self.last_checked: + return False + # search for given user agent matches + # the first match counts + parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url)) + url = urllib.parse.urlunparse(('','',parsed_url.path, + parsed_url.params,parsed_url.query, parsed_url.fragment)) + url = urllib.parse.quote(url) + if not url: + url = "/" + for entry in self.entries: + if entry.applies_to(useragent): + return entry.allowance(url) + # try the default entry last + if self.default_entry: + return self.default_entry.allowance(url) + # agent not found ==> access granted + return True + + def crawl_delay(self, useragent): + if not self.mtime(): + return None + for entry in self.entries: + if entry.applies_to(useragent): + return entry.delay + return self.default_entry.delay + + def request_rate(self, useragent): + if not self.mtime(): + return None + for entry in self.entries: + if entry.applies_to(useragent): + return entry.req_rate + return self.default_entry.req_rate + + def __str__(self): + return ''.join([str(entry) + "\n" for entry in self.entries]) + + +class RuleLine: + """A rule line is a single "Allow:" (allowance==True) or "Disallow:" + (allowance==False) followed by a path.""" + def __init__(self, path, allowance): + if path == '' and not allowance: + # an empty value means allow all + allowance = True + path = urllib.parse.urlunparse(urllib.parse.urlparse(path)) + self.path = urllib.parse.quote(path) + self.allowance = allowance + + def applies_to(self, filename): + return self.path == "*" or filename.startswith(self.path) + + def __str__(self): + return ("Allow" if self.allowance else "Disallow") + ": " + self.path + + +class Entry: + """An entry has one or more user-agents and zero or more rulelines""" + def __init__(self): + self.useragents = [] + self.rulelines = [] + self.delay = None + self.req_rate = None + + def __str__(self): + ret = [] + for agent in self.useragents: + ret.extend(["User-agent: ", agent, "\n"]) + for line in self.rulelines: + ret.extend([str(line), "\n"]) + return ''.join(ret) + + def applies_to(self, useragent): + """check if this entry applies to the specified agent""" + # split the name token and make it lower case + useragent = useragent.split("/")[0].lower() + for agent in self.useragents: + if agent == '*': + # we have the catch-all agent + return True + agent = agent.lower() + if agent in useragent: + return True + return False + + def allowance(self, filename): + """Preconditions: + - our agent applies to this entry + - filename is URL decoded""" + for line in self.rulelines: + if line.applies_to(filename): + return line.allowance + return True diff --git a/Lib/uu.py b/Lib/uu.py new file mode 100755 index 0000000000..d68d29374a --- /dev/null +++ b/Lib/uu.py @@ -0,0 +1,199 @@ +#! /usr/bin/env python3 + +# Copyright 1994 by Lance Ellinghouse +# Cathedral City, California Republic, United States of America. +# All Rights Reserved +# Permission to use, copy, modify, and distribute this software and its +# documentation for any purpose and without fee is hereby granted, +# provided that the above copyright notice appear in all copies and that +# both that copyright notice and this permission notice appear in +# supporting documentation, and that the name of Lance Ellinghouse +# not be used in advertising or publicity pertaining to distribution +# of the software without specific, written prior permission. +# LANCE ELLINGHOUSE DISCLAIMS ALL WARRANTIES WITH REGARD TO +# THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND +# FITNESS, IN NO EVENT SHALL LANCE ELLINGHOUSE CENTRUM BE LIABLE +# FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT +# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# +# Modified by Jack Jansen, CWI, July 1995: +# - Use binascii module to do the actual line-by-line conversion +# between ascii and binary. This results in a 1000-fold speedup. The C +# version is still 5 times faster, though. +# - Arguments more compliant with python standard + +"""Implementation of the UUencode and UUdecode functions. + +encode(in_file, out_file [,name, mode]) +decode(in_file [, out_file, mode]) +""" + +import binascii +import os +import sys + +__all__ = ["Error", "encode", "decode"] + +class Error(Exception): + pass + +def encode(in_file, out_file, name=None, mode=None): + """Uuencode file""" + # + # If in_file is a pathname open it and change defaults + # + opened_files = [] + try: + if in_file == '-': + in_file = sys.stdin.buffer + elif isinstance(in_file, str): + if name is None: + name = os.path.basename(in_file) + if mode is None: + try: + mode = os.stat(in_file).st_mode + except AttributeError: + pass + in_file = open(in_file, 'rb') + opened_files.append(in_file) + # + # Open out_file if it is a pathname + # + if out_file == '-': + out_file = sys.stdout.buffer + elif isinstance(out_file, str): + out_file = open(out_file, 'wb') + opened_files.append(out_file) + # + # Set defaults for name and mode + # + if name is None: + name = '-' + if mode is None: + mode = 0o666 + # + # Write the data + # + out_file.write(('begin %o %s\n' % ((mode & 0o777), name)).encode("ascii")) + data = in_file.read(45) + while len(data) > 0: + out_file.write(binascii.b2a_uu(data)) + data = in_file.read(45) + out_file.write(b' \nend\n') + finally: + for f in opened_files: + f.close() + + +def decode(in_file, out_file=None, mode=None, quiet=False): + """Decode uuencoded file""" + # + # Open the input file, if needed. + # + opened_files = [] + if in_file == '-': + in_file = sys.stdin.buffer + elif isinstance(in_file, str): + in_file = open(in_file, 'rb') + opened_files.append(in_file) + + try: + # + # Read until a begin is encountered or we've exhausted the file + # + while True: + hdr = in_file.readline() + if not hdr: + raise Error('No valid begin line found in input file') + if not hdr.startswith(b'begin'): + continue + hdrfields = hdr.split(b' ', 2) + if len(hdrfields) == 3 and hdrfields[0] == b'begin': + try: + int(hdrfields[1], 8) + break + except ValueError: + pass + if out_file is None: + # If the filename isn't ASCII, what's up with that?!? + out_file = hdrfields[2].rstrip(b' \t\r\n\f').decode("ascii") + if os.path.exists(out_file): + raise Error('Cannot overwrite existing file: %s' % out_file) + if mode is None: + mode = int(hdrfields[1], 8) + # + # Open the output file + # + if out_file == '-': + out_file = sys.stdout.buffer + elif isinstance(out_file, str): + fp = open(out_file, 'wb') + try: + os.path.chmod(out_file, mode) + except AttributeError: + pass + out_file = fp + opened_files.append(out_file) + # + # Main decoding loop + # + s = in_file.readline() + while s and s.strip(b' \t\r\n\f') != b'end': + try: + data = binascii.a2b_uu(s) + except binascii.Error as v: + # Workaround for broken uuencoders by /Fredrik Lundh + nbytes = (((s[0]-32) & 63) * 4 + 5) // 3 + data = binascii.a2b_uu(s[:nbytes]) + if not quiet: + sys.stderr.write("Warning: %s\n" % v) + out_file.write(data) + s = in_file.readline() + if not s: + raise Error('Truncated input file') + finally: + for f in opened_files: + f.close() + +def test(): + """uuencode/uudecode main program""" + + import optparse + parser = optparse.OptionParser(usage='usage: %prog [-d] [-t] [input [output]]') + parser.add_option('-d', '--decode', dest='decode', help='Decode (instead of encode)?', default=False, action='store_true') + parser.add_option('-t', '--text', dest='text', help='data is text, encoded format unix-compatible text?', default=False, action='store_true') + + (options, args) = parser.parse_args() + if len(args) > 2: + parser.error('incorrect number of arguments') + sys.exit(1) + + # Use the binary streams underlying stdin/stdout + input = sys.stdin.buffer + output = sys.stdout.buffer + if len(args) > 0: + input = args[0] + if len(args) > 1: + output = args[1] + + if options.decode: + if options.text: + if isinstance(output, str): + output = open(output, 'wb') + else: + print(sys.argv[0], ': cannot do -t to stdout') + sys.exit(1) + decode(input, output) + else: + if options.text: + if isinstance(input, str): + input = open(input, 'rb') + else: + print(sys.argv[0], ': cannot do -t from stdin') + sys.exit(1) + encode(input, output) + +if __name__ == '__main__': + test() diff --git a/Lib/uuid.py b/Lib/uuid.py new file mode 100644 index 0000000000..200c800b34 --- /dev/null +++ b/Lib/uuid.py @@ -0,0 +1,615 @@ +r"""UUID objects (universally unique identifiers) according to RFC 4122. + +This module provides immutable UUID objects (class UUID) and the functions +uuid1(), uuid3(), uuid4(), uuid5() for generating version 1, 3, 4, and 5 +UUIDs as specified in RFC 4122. + +If all you want is a unique ID, you should probably call uuid1() or uuid4(). +Note that uuid1() may compromise privacy since it creates a UUID containing +the computer's network address. uuid4() creates a random UUID. + +Typical usage: + + >>> import uuid + + # make a UUID based on the host ID and current time + >>> uuid.uuid1() # doctest: +SKIP + UUID('a8098c1a-f86e-11da-bd1a-00112444be1e') + + # make a UUID using an MD5 hash of a namespace UUID and a name + >>> uuid.uuid3(uuid.NAMESPACE_DNS, 'python.org') + UUID('6fa459ea-ee8a-3ca4-894e-db77e160355e') + + # make a random UUID + >>> uuid.uuid4() # doctest: +SKIP + UUID('16fd2706-8baf-433b-82eb-8c7fada847da') + + # make a UUID using a SHA-1 hash of a namespace UUID and a name + >>> uuid.uuid5(uuid.NAMESPACE_DNS, 'python.org') + UUID('886313e1-3b8a-5372-9b90-0c9aee199e5d') + + # make a UUID from a string of hex digits (braces and hyphens ignored) + >>> x = uuid.UUID('{00010203-0405-0607-0809-0a0b0c0d0e0f}') + + # convert a UUID to a string of hex digits in standard form + >>> str(x) + '00010203-0405-0607-0809-0a0b0c0d0e0f' + + # get the raw 16 bytes of the UUID + >>> x.bytes + b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f' + + # make a UUID from a 16-byte string + >>> uuid.UUID(bytes=x.bytes) + UUID('00010203-0405-0607-0809-0a0b0c0d0e0f') +""" + +import os + +__author__ = 'Ka-Ping Yee ' + +RESERVED_NCS, RFC_4122, RESERVED_MICROSOFT, RESERVED_FUTURE = [ + 'reserved for NCS compatibility', 'specified in RFC 4122', + 'reserved for Microsoft compatibility', 'reserved for future definition'] + +int_ = int # The built-in int type +bytes_ = bytes # The built-in bytes type + +class UUID(object): + """Instances of the UUID class represent UUIDs as specified in RFC 4122. + UUID objects are immutable, hashable, and usable as dictionary keys. + Converting a UUID to a string with str() yields something in the form + '12345678-1234-1234-1234-123456789abc'. The UUID constructor accepts + five possible forms: a similar string of hexadecimal digits, or a tuple + of six integer fields (with 32-bit, 16-bit, 16-bit, 8-bit, 8-bit, and + 48-bit values respectively) as an argument named 'fields', or a string + of 16 bytes (with all the integer fields in big-endian order) as an + argument named 'bytes', or a string of 16 bytes (with the first three + fields in little-endian order) as an argument named 'bytes_le', or a + single 128-bit integer as an argument named 'int'. + + UUIDs have these read-only attributes: + + bytes the UUID as a 16-byte string (containing the six + integer fields in big-endian byte order) + + bytes_le the UUID as a 16-byte string (with time_low, time_mid, + and time_hi_version in little-endian byte order) + + fields a tuple of the six integer fields of the UUID, + which are also available as six individual attributes + and two derived attributes: + + time_low the first 32 bits of the UUID + time_mid the next 16 bits of the UUID + time_hi_version the next 16 bits of the UUID + clock_seq_hi_variant the next 8 bits of the UUID + clock_seq_low the next 8 bits of the UUID + node the last 48 bits of the UUID + + time the 60-bit timestamp + clock_seq the 14-bit sequence number + + hex the UUID as a 32-character hexadecimal string + + int the UUID as a 128-bit integer + + urn the UUID as a URN as specified in RFC 4122 + + variant the UUID variant (one of the constants RESERVED_NCS, + RFC_4122, RESERVED_MICROSOFT, or RESERVED_FUTURE) + + version the UUID version number (1 through 5, meaningful only + when the variant is RFC_4122) + """ + + def __init__(self, hex=None, bytes=None, bytes_le=None, fields=None, + int=None, version=None): + r"""Create a UUID from either a string of 32 hexadecimal digits, + a string of 16 bytes as the 'bytes' argument, a string of 16 bytes + in little-endian order as the 'bytes_le' argument, a tuple of six + integers (32-bit time_low, 16-bit time_mid, 16-bit time_hi_version, + 8-bit clock_seq_hi_variant, 8-bit clock_seq_low, 48-bit node) as + the 'fields' argument, or a single 128-bit integer as the 'int' + argument. When a string of hex digits is given, curly braces, + hyphens, and a URN prefix are all optional. For example, these + expressions all yield the same UUID: + + UUID('{12345678-1234-5678-1234-567812345678}') + UUID('12345678123456781234567812345678') + UUID('urn:uuid:12345678-1234-5678-1234-567812345678') + UUID(bytes='\x12\x34\x56\x78'*4) + UUID(bytes_le='\x78\x56\x34\x12\x34\x12\x78\x56' + + '\x12\x34\x56\x78\x12\x34\x56\x78') + UUID(fields=(0x12345678, 0x1234, 0x5678, 0x12, 0x34, 0x567812345678)) + UUID(int=0x12345678123456781234567812345678) + + Exactly one of 'hex', 'bytes', 'bytes_le', 'fields', or 'int' must + be given. The 'version' argument is optional; if given, the resulting + UUID will have its variant and version set according to RFC 4122, + overriding the given 'hex', 'bytes', 'bytes_le', 'fields', or 'int'. + """ + + if [hex, bytes, bytes_le, fields, int].count(None) != 4: + raise TypeError('one of the hex, bytes, bytes_le, fields, ' + 'or int arguments must be given') + if hex is not None: + hex = hex.replace('urn:', '').replace('uuid:', '') + hex = hex.strip('{}').replace('-', '') + if len(hex) != 32: + raise ValueError('badly formed hexadecimal UUID string') + int = int_(hex, 16) + if bytes_le is not None: + if len(bytes_le) != 16: + raise ValueError('bytes_le is not a 16-char string') + bytes = (bytes_le[4-1::-1] + bytes_le[6-1:4-1:-1] + + bytes_le[8-1:6-1:-1] + bytes_le[8:]) + if bytes is not None: + if len(bytes) != 16: + raise ValueError('bytes is not a 16-char string') + assert isinstance(bytes, bytes_), repr(bytes) + int = int_.from_bytes(bytes, byteorder='big') + if fields is not None: + if len(fields) != 6: + raise ValueError('fields is not a 6-tuple') + (time_low, time_mid, time_hi_version, + clock_seq_hi_variant, clock_seq_low, node) = fields + if not 0 <= time_low < 1<<32: + raise ValueError('field 1 out of range (need a 32-bit value)') + if not 0 <= time_mid < 1<<16: + raise ValueError('field 2 out of range (need a 16-bit value)') + if not 0 <= time_hi_version < 1<<16: + raise ValueError('field 3 out of range (need a 16-bit value)') + if not 0 <= clock_seq_hi_variant < 1<<8: + raise ValueError('field 4 out of range (need an 8-bit value)') + if not 0 <= clock_seq_low < 1<<8: + raise ValueError('field 5 out of range (need an 8-bit value)') + if not 0 <= node < 1<<48: + raise ValueError('field 6 out of range (need a 48-bit value)') + clock_seq = (clock_seq_hi_variant << 8) | clock_seq_low + int = ((time_low << 96) | (time_mid << 80) | + (time_hi_version << 64) | (clock_seq << 48) | node) + if int is not None: + if not 0 <= int < 1<<128: + raise ValueError('int is out of range (need a 128-bit value)') + if version is not None: + if not 1 <= version <= 5: + raise ValueError('illegal version number') + # Set the variant to RFC 4122. + int &= ~(0xc000 << 48) + int |= 0x8000 << 48 + # Set the version number. + int &= ~(0xf000 << 64) + int |= version << 76 + self.__dict__['int'] = int + + def __eq__(self, other): + if isinstance(other, UUID): + return self.int == other.int + return NotImplemented + + # Q. What's the value of being able to sort UUIDs? + # A. Use them as keys in a B-Tree or similar mapping. + + def __lt__(self, other): + if isinstance(other, UUID): + return self.int < other.int + return NotImplemented + + def __gt__(self, other): + if isinstance(other, UUID): + return self.int > other.int + return NotImplemented + + def __le__(self, other): + if isinstance(other, UUID): + return self.int <= other.int + return NotImplemented + + def __ge__(self, other): + if isinstance(other, UUID): + return self.int >= other.int + return NotImplemented + + def __hash__(self): + return hash(self.int) + + def __int__(self): + return self.int + + def __repr__(self): + return '%s(%r)' % (self.__class__.__name__, str(self)) + + def __setattr__(self, name, value): + raise TypeError('UUID objects are immutable') + + def __str__(self): + hex = '%032x' % self.int + return '%s-%s-%s-%s-%s' % ( + hex[:8], hex[8:12], hex[12:16], hex[16:20], hex[20:]) + + @property + def bytes(self): + return self.int.to_bytes(16, 'big') + + @property + def bytes_le(self): + bytes = self.bytes + return (bytes[4-1::-1] + bytes[6-1:4-1:-1] + bytes[8-1:6-1:-1] + + bytes[8:]) + + @property + def fields(self): + return (self.time_low, self.time_mid, self.time_hi_version, + self.clock_seq_hi_variant, self.clock_seq_low, self.node) + + @property + def time_low(self): + return self.int >> 96 + + @property + def time_mid(self): + return (self.int >> 80) & 0xffff + + @property + def time_hi_version(self): + return (self.int >> 64) & 0xffff + + @property + def clock_seq_hi_variant(self): + return (self.int >> 56) & 0xff + + @property + def clock_seq_low(self): + return (self.int >> 48) & 0xff + + @property + def time(self): + return (((self.time_hi_version & 0x0fff) << 48) | + (self.time_mid << 32) | self.time_low) + + @property + def clock_seq(self): + return (((self.clock_seq_hi_variant & 0x3f) << 8) | + self.clock_seq_low) + + @property + def node(self): + return self.int & 0xffffffffffff + + @property + def hex(self): + return '%032x' % self.int + + @property + def urn(self): + return 'urn:uuid:' + str(self) + + @property + def variant(self): + if not self.int & (0x8000 << 48): + return RESERVED_NCS + elif not self.int & (0x4000 << 48): + return RFC_4122 + elif not self.int & (0x2000 << 48): + return RESERVED_MICROSOFT + else: + return RESERVED_FUTURE + + @property + def version(self): + # The version bits are only meaningful for RFC 4122 UUIDs. + if self.variant == RFC_4122: + return int((self.int >> 76) & 0xf) + +def _popen(command, *args): + import os, shutil, subprocess + executable = shutil.which(command) + if executable is None: + path = os.pathsep.join(('/sbin', '/usr/sbin')) + executable = shutil.which(command, path=path) + if executable is None: + return None + # LC_ALL=C to ensure English output, stderr=DEVNULL to prevent output + # on stderr (Note: we don't have an example where the words we search + # for are actually localized, but in theory some system could do so.) + env = dict(os.environ) + env['LC_ALL'] = 'C' + proc = subprocess.Popen((executable,) + args, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + env=env) + return proc + +def _find_mac(command, args, hw_identifiers, get_index): + try: + proc = _popen(command, *args.split()) + if not proc: + return + with proc: + for line in proc.stdout: + words = line.lower().rstrip().split() + for i in range(len(words)): + if words[i] in hw_identifiers: + try: + word = words[get_index(i)] + mac = int(word.replace(b':', b''), 16) + if mac: + return mac + except (ValueError, IndexError): + # Virtual interfaces, such as those provided by + # VPNs, do not have a colon-delimited MAC address + # as expected, but a 16-byte HWAddr separated by + # dashes. These should be ignored in favor of a + # real MAC address + pass + except OSError: + pass + +def _ifconfig_getnode(): + """Get the hardware address on Unix by running ifconfig.""" + # This works on Linux ('' or '-a'), Tru64 ('-av'), but not all Unixes. + for args in ('', '-a', '-av'): + mac = _find_mac('ifconfig', args, [b'hwaddr', b'ether'], lambda i: i+1) + if mac: + return mac + +def _ip_getnode(): + """Get the hardware address on Unix by running ip.""" + # This works on Linux with iproute2. + mac = _find_mac('ip', 'link list', [b'link/ether'], lambda i: i+1) + if mac: + return mac + +def _arp_getnode(): + """Get the hardware address on Unix by running arp.""" + import os, socket + try: + ip_addr = socket.gethostbyname(socket.gethostname()) + except OSError: + return None + + # Try getting the MAC addr from arp based on our IP address (Solaris). + return _find_mac('arp', '-an', [os.fsencode(ip_addr)], lambda i: -1) + +def _lanscan_getnode(): + """Get the hardware address on Unix by running lanscan.""" + # This might work on HP-UX. + return _find_mac('lanscan', '-ai', [b'lan0'], lambda i: 0) + +def _netstat_getnode(): + """Get the hardware address on Unix by running netstat.""" + # This might work on AIX, Tru64 UNIX and presumably on IRIX. + try: + proc = _popen('netstat', '-ia') + if not proc: + return + with proc: + words = proc.stdout.readline().rstrip().split() + try: + i = words.index(b'Address') + except ValueError: + return + for line in proc.stdout: + try: + words = line.rstrip().split() + word = words[i] + if len(word) == 17 and word.count(b':') == 5: + mac = int(word.replace(b':', b''), 16) + if mac: + return mac + except (ValueError, IndexError): + pass + except OSError: + pass + +def _ipconfig_getnode(): + """Get the hardware address on Windows by running ipconfig.exe.""" + import os, re + dirs = ['', r'c:\windows\system32', r'c:\winnt\system32'] + try: + import ctypes + buffer = ctypes.create_string_buffer(300) + ctypes.windll.kernel32.GetSystemDirectoryA(buffer, 300) + dirs.insert(0, buffer.value.decode('mbcs')) + except: + pass + for dir in dirs: + try: + pipe = os.popen(os.path.join(dir, 'ipconfig') + ' /all') + except OSError: + continue + with pipe: + for line in pipe: + value = line.split(':')[-1].strip().lower() + if re.match('([0-9a-f][0-9a-f]-){5}[0-9a-f][0-9a-f]', value): + return int(value.replace('-', ''), 16) + +def _netbios_getnode(): + """Get the hardware address on Windows using NetBIOS calls. + See http://support.microsoft.com/kb/118623 for details.""" + import win32wnet, netbios + ncb = netbios.NCB() + ncb.Command = netbios.NCBENUM + ncb.Buffer = adapters = netbios.LANA_ENUM() + adapters._pack() + if win32wnet.Netbios(ncb) != 0: + return + adapters._unpack() + for i in range(adapters.length): + ncb.Reset() + ncb.Command = netbios.NCBRESET + ncb.Lana_num = ord(adapters.lana[i]) + if win32wnet.Netbios(ncb) != 0: + continue + ncb.Reset() + ncb.Command = netbios.NCBASTAT + ncb.Lana_num = ord(adapters.lana[i]) + ncb.Callname = '*'.ljust(16) + ncb.Buffer = status = netbios.ADAPTER_STATUS() + if win32wnet.Netbios(ncb) != 0: + continue + status._unpack() + bytes = status.adapter_address[:6] + if len(bytes) != 6: + continue + return int.from_bytes(bytes, 'big') + +# Thanks to Thomas Heller for ctypes and for his help with its use here. + +# If ctypes is available, use it to find system routines for UUID generation. +# XXX This makes the module non-thread-safe! +_uuid_generate_time = _UuidCreate = None +try: + import ctypes, ctypes.util + import sys + + # The uuid_generate_* routines are provided by libuuid on at least + # Linux and FreeBSD, and provided by libc on Mac OS X. + _libnames = ['uuid'] + if not sys.platform.startswith('win'): + _libnames.append('c') + for libname in _libnames: + try: + lib = ctypes.CDLL(ctypes.util.find_library(libname)) + except Exception: + continue + if hasattr(lib, 'uuid_generate_time'): + _uuid_generate_time = lib.uuid_generate_time + break + del _libnames + + # The uuid_generate_* functions are broken on MacOS X 10.5, as noted + # in issue #8621 the function generates the same sequence of values + # in the parent process and all children created using fork (unless + # those children use exec as well). + # + # Assume that the uuid_generate functions are broken from 10.5 onward, + # the test can be adjusted when a later version is fixed. + if sys.platform == 'darwin': + if int(os.uname().release.split('.')[0]) >= 9: + _uuid_generate_time = None + + # On Windows prior to 2000, UuidCreate gives a UUID containing the + # hardware address. On Windows 2000 and later, UuidCreate makes a + # random UUID and UuidCreateSequential gives a UUID containing the + # hardware address. These routines are provided by the RPC runtime. + # NOTE: at least on Tim's WinXP Pro SP2 desktop box, while the last + # 6 bytes returned by UuidCreateSequential are fixed, they don't appear + # to bear any relationship to the MAC address of any network device + # on the box. + try: + lib = ctypes.windll.rpcrt4 + except: + lib = None + _UuidCreate = getattr(lib, 'UuidCreateSequential', + getattr(lib, 'UuidCreate', None)) +except: + pass + +def _unixdll_getnode(): + """Get the hardware address on Unix using ctypes.""" + _buffer = ctypes.create_string_buffer(16) + _uuid_generate_time(_buffer) + return UUID(bytes=bytes_(_buffer.raw)).node + +def _windll_getnode(): + """Get the hardware address on Windows using ctypes.""" + _buffer = ctypes.create_string_buffer(16) + if _UuidCreate(_buffer) == 0: + return UUID(bytes=bytes_(_buffer.raw)).node + +def _random_getnode(): + """Get a random node ID, with eighth bit set as suggested by RFC 4122.""" + import random + return random.getrandbits(48) | 0x010000000000 + +_node = None + +def getnode(): + """Get the hardware address as a 48-bit positive integer. + + The first time this runs, it may launch a separate program, which could + be quite slow. If all attempts to obtain the hardware address fail, we + choose a random 48-bit number with its eighth bit set to 1 as recommended + in RFC 4122. + """ + + global _node + if _node is not None: + return _node + + import sys + if sys.platform == 'win32': + getters = [_windll_getnode, _netbios_getnode, _ipconfig_getnode] + else: + getters = [_unixdll_getnode, _ifconfig_getnode, _ip_getnode, + _arp_getnode, _lanscan_getnode, _netstat_getnode] + + for getter in getters + [_random_getnode]: + try: + _node = getter() + except: + continue + if _node is not None: + return _node + +_last_timestamp = None + +def uuid1(node=None, clock_seq=None): + """Generate a UUID from a host ID, sequence number, and the current time. + If 'node' is not given, getnode() is used to obtain the hardware + address. If 'clock_seq' is given, it is used as the sequence number; + otherwise a random 14-bit sequence number is chosen.""" + + # When the system provides a version-1 UUID generator, use it (but don't + # use UuidCreate here because its UUIDs don't conform to RFC 4122). + if _uuid_generate_time and node is clock_seq is None: + _buffer = ctypes.create_string_buffer(16) + _uuid_generate_time(_buffer) + return UUID(bytes=bytes_(_buffer.raw)) + + global _last_timestamp + import time + nanoseconds = int(time.time() * 1e9) + # 0x01b21dd213814000 is the number of 100-ns intervals between the + # UUID epoch 1582-10-15 00:00:00 and the Unix epoch 1970-01-01 00:00:00. + timestamp = int(nanoseconds/100) + 0x01b21dd213814000 + if _last_timestamp is not None and timestamp <= _last_timestamp: + timestamp = _last_timestamp + 1 + _last_timestamp = timestamp + if clock_seq is None: + import random + clock_seq = random.getrandbits(14) # instead of stable storage + time_low = timestamp & 0xffffffff + time_mid = (timestamp >> 32) & 0xffff + time_hi_version = (timestamp >> 48) & 0x0fff + clock_seq_low = clock_seq & 0xff + clock_seq_hi_variant = (clock_seq >> 8) & 0x3f + if node is None: + node = getnode() + return UUID(fields=(time_low, time_mid, time_hi_version, + clock_seq_hi_variant, clock_seq_low, node), version=1) + +def uuid3(namespace, name): + """Generate a UUID from the MD5 hash of a namespace UUID and a name.""" + from hashlib import md5 + hash = md5(namespace.bytes + bytes(name, "utf-8")).digest() + return UUID(bytes=hash[:16], version=3) + +def uuid4(): + """Generate a random UUID.""" + return UUID(bytes=os.urandom(16), version=4) + +def uuid5(namespace, name): + """Generate a UUID from the SHA-1 hash of a namespace UUID and a name.""" + from hashlib import sha1 + hash = sha1(namespace.bytes + bytes(name, "utf-8")).digest() + return UUID(bytes=hash[:16], version=5) + +# The following standard UUIDs are for use with uuid3() or uuid5(). + +NAMESPACE_DNS = UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8') +NAMESPACE_URL = UUID('6ba7b811-9dad-11d1-80b4-00c04fd430c8') +NAMESPACE_OID = UUID('6ba7b812-9dad-11d1-80b4-00c04fd430c8') +NAMESPACE_X500 = UUID('6ba7b814-9dad-11d1-80b4-00c04fd430c8') From abb190ce50f1075e036f27822f986a63cb12cba6 Mon Sep 17 00:00:00 2001 From: coolreader18 <33094578+coolreader18@users.noreply.github.com> Date: Mon, 2 Dec 2019 21:15:37 -0600 Subject: [PATCH 2/7] Some small fixes for http to work --- Lib/hmac.py | 3 ++- Lib/socketserver.py | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Lib/hmac.py b/Lib/hmac.py index 121029aa67..0a0b3cee74 100644 --- a/Lib/hmac.py +++ b/Lib/hmac.py @@ -4,7 +4,8 @@ Implements the HMAC algorithm as described by RFC 2104. """ import warnings as _warnings -from _operator import _compare_digest as compare_digest +# XXX RustPython TODO: _operator +#from _operator import _compare_digest as compare_digest import hashlib as _hashlib trans_5C = bytes((x ^ 0x5C) for x in range(256)) diff --git a/Lib/socketserver.py b/Lib/socketserver.py index 41a3766772..b654c06d84 100644 --- a/Lib/socketserver.py +++ b/Lib/socketserver.py @@ -773,8 +773,10 @@ class _SocketWriter(BufferedIOBase): def write(self, b): self._sock.sendall(b) - with memoryview(b) as view: - return view.nbytes + # XXX RustPython TODO: implement memoryview properly + #with memoryview(b) as view: + # return view.nbytes + return len(b) def fileno(self): return self._sock.fileno() From bd4a492a53239c014efa051b36e0d71ce5358f8f Mon Sep 17 00:00:00 2001 From: coolreader18 <33094578+coolreader18@users.noreply.github.com> Date: Wed, 4 Dec 2019 17:54:58 -0600 Subject: [PATCH 3/7] Add fast path for new byte inner of bytes or bytearray --- vm/src/obj/objbyteinner.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vm/src/obj/objbyteinner.rs b/vm/src/obj/objbyteinner.rs index bf8f802c54..ac6e5c841e 100644 --- a/vm/src/obj/objbyteinner.rs +++ b/vm/src/obj/objbyteinner.rs @@ -85,6 +85,8 @@ impl ByteInnerNewOptions { vm.new_type_error("string argument without an encoding".to_string()) ); } + i @ PyBytes => Ok(i.get_value().to_vec()), + j @ PyByteArray => Ok(j.inner.borrow().elements.to_vec()), obj => { // TODO: only support this method in the bytes() constructor if let Some(bytes_method) = vm.get_method(obj.clone(), "__bytes__") { From dcbf1a7ac4f391c565d4eb887c4ddd890284b457 Mon Sep 17 00:00:00 2001 From: coolreader18 <33094578+coolreader18@users.noreply.github.com> Date: Wed, 4 Dec 2019 22:57:58 -0600 Subject: [PATCH 4/7] Add socket.setsockopt --- vm/src/stdlib/socket.rs | 87 +++++++++++++++++++++++++++++++++++------ 1 file changed, 76 insertions(+), 11 deletions(-) diff --git a/vm/src/stdlib/socket.rs b/vm/src/stdlib/socket.rs index da384eea36..44b0d04d09 100644 --- a/vm/src/stdlib/socket.rs +++ b/vm/src/stdlib/socket.rs @@ -179,10 +179,12 @@ impl PySocket { #[pymethod] fn recv(&self, bufsize: usize, vm: &VirtualMachine) -> PyResult> { let mut buffer = vec![0u8; bufsize]; - match self.sock.borrow_mut().read_exact(&mut buffer) { - Ok(()) => Ok(buffer), - Err(err) => Err(convert_sock_error(vm, err)), - } + let n = self + .sock() + .recv(&mut buffer) + .map_err(|err| convert_sock_error(vm, err))?; + buffer.truncate(n); + Ok(buffer) } #[pymethod] @@ -196,10 +198,12 @@ impl PySocket { #[pymethod] fn recvfrom(&self, bufsize: usize, vm: &VirtualMachine) -> PyResult<(Vec, AddrTuple)> { let mut buffer = vec![0u8; bufsize]; - match self.sock().recv_from(&mut buffer) { - Ok((_, addr)) => Ok((buffer, get_addr_tuple(addr))), - Err(err) => Err(convert_sock_error(vm, err)), - } + let (n, addr) = self + .sock() + .recv_from(&mut buffer) + .map_err(|err| convert_sock_error(vm, err))?; + buffer.truncate(n); + Ok((buffer, get_addr_tuple(addr))) } #[pymethod] @@ -276,16 +280,73 @@ impl PySocket { } #[pymethod] - fn settimeout(&self, timeout: Option, vm: &VirtualMachine) -> PyResult<()> { + fn settimeout(&self, timeout: Option>, vm: &VirtualMachine) -> PyResult<()> { + let mut block = if timeout.is_none() { Some(true) } else { None }; + let timeout = timeout.and_then(|n| { + let dur = match n { + Either::A(f) => Duration::from_secs_f64(f), + Either::B(i) => Duration::from_secs(i), + }; + if dur == Duration::from_secs(0) { + block = Some(false); + None + } else { + Some(dur) + } + }); self.sock() - .set_read_timeout(timeout.map(Duration::from_secs_f64)) + .set_read_timeout(timeout) .map_err(|err| convert_sock_error(vm, err))?; self.sock() - .set_write_timeout(timeout.map(Duration::from_secs_f64)) + .set_write_timeout(timeout) .map_err(|err| convert_sock_error(vm, err))?; + if let Some(blocking) = block { + self.sock() + .set_nonblocking(!blocking) + .map_err(|err| convert_sock_error(vm, err))?; + } Ok(()) } + #[pymethod] + fn setsockopt( + &self, + level: i32, + name: i32, + value: Option>, + optlen: OptionalArg, + vm: &VirtualMachine, + ) -> PyResult<()> { + let fd = sock_fileno(&self.sock()); + let ret = match (value, optlen) { + (Some(Either::A(b)), OptionalArg::Missing) => b.with_ref(|b| unsafe { + c::setsockopt(fd, level, name, b.as_ptr() as *const _, b.len() as _) + }), + (Some(Either::B(ref val)), OptionalArg::Missing) => unsafe { + c::setsockopt( + fd, + level, + name, + val as *const i32 as *const _, + std::mem::size_of::() as _, + ) + }, + (None, OptionalArg::Present(optlen)) => unsafe { + c::setsockopt(fd, level, name, std::ptr::null(), optlen as _) + }, + _ => { + return Err( + vm.new_type_error("expected the value arg xor the optlen arg".to_string()) + ); + } + }; + if ret < 0 { + Err(convert_sock_error(vm, io::Error::last_os_error())) + } else { + Ok(()) + } + } + #[pymethod] fn shutdown(&self, how: i32, vm: &VirtualMachine) -> PyResult<()> { let how = match how { @@ -570,6 +631,10 @@ pub fn make_module(vm: &VirtualMachine) -> PyObjectRef { "IPPROTO_IPIP" => ctx.new_int(c::IPPROTO_IP), "IPPROTO_IPV6" => ctx.new_int(c::IPPROTO_IPV6), "IPPROTO_NONE" => ctx.new_int(c::IPPROTO_NONE), + "SOL_SOCKET" => ctx.new_int(c::SOL_SOCKET), + "SO_REUSEADDR" => ctx.new_int(c::SO_REUSEADDR), + "TCP_NODELAY" => ctx.new_int(c::TCP_NODELAY), + "SO_BROADCAST" => ctx.new_int(c::SO_BROADCAST), "socket" => PySocket::make_class(ctx), "inet_aton" => ctx.new_rustfunc(socket_inet_aton), "inet_ntoa" => ctx.new_rustfunc(socket_inet_ntoa), From 5f142387372b5ca980bc03869d18c67a6fd43a8a Mon Sep 17 00:00:00 2001 From: coolreader18 <33094578+coolreader18@users.noreply.github.com> Date: Wed, 4 Dec 2019 22:58:14 -0600 Subject: [PATCH 5/7] Add BytesIO.truncate --- vm/src/stdlib/io.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vm/src/stdlib/io.rs b/vm/src/stdlib/io.rs index 7cd1e893de..ecf1c9d3b2 100644 --- a/vm/src/stdlib/io.rs +++ b/vm/src/stdlib/io.rs @@ -288,6 +288,13 @@ impl PyBytesIORef { } } + fn truncate(self, size: OptionalOption, vm: &VirtualMachine) -> PyResult<()> { + let mut buffer = self.buffer(vm)?; + let size = size.flat_option().unwrap_or_else(|| buffer.tell() as usize); + buffer.cursor.get_mut().truncate(size); + Ok(()) + } + fn closed(self, _vm: &VirtualMachine) -> bool { self.buffer.borrow().is_none() } @@ -1027,6 +1034,7 @@ pub fn make_module(vm: &VirtualMachine) -> PyObjectRef { "getvalue" => ctx.new_rustfunc(PyBytesIORef::getvalue), "tell" => ctx.new_rustfunc(PyBytesIORef::tell), "readline" => ctx.new_rustfunc(PyBytesIORef::readline), + "truncate" => ctx.new_rustfunc(PyBytesIORef::truncate), "closed" => ctx.new_property(PyBytesIORef::closed), "close" => ctx.new_rustfunc(PyBytesIORef::close), }); From 32d1016a7f85e8e5d8d08b42f0516913f3dba5cc Mon Sep 17 00:00:00 2001 From: coolreader18 <33094578+coolreader18@users.noreply.github.com> Date: Tue, 17 Dec 2019 00:11:36 -0600 Subject: [PATCH 6/7] Add a TryFromObject impl for std::time::Duration --- vm/src/pyobject.rs | 16 ++++++++++++++++ vm/src/stdlib/socket.rs | 23 +++++++++-------------- vm/src/stdlib/time_module.rs | 9 ++++----- 3 files changed, 29 insertions(+), 19 deletions(-) diff --git a/vm/src/pyobject.rs b/vm/src/pyobject.rs index 9ac099ff0e..3cf62c49bd 100644 --- a/vm/src/pyobject.rs +++ b/vm/src/pyobject.rs @@ -1183,6 +1183,22 @@ pub trait PyClassImpl: PyClassDef { } } +// TODO: find a better place to put this impl +impl TryFromObject for std::time::Duration { + fn try_from_object(vm: &VirtualMachine, obj: PyObjectRef) -> PyResult { + use std::time::Duration; + u64::try_from_object(vm, obj.clone()) + .map(Duration::from_secs) + .or_else(|_| f64::try_from_object(vm, obj.clone()).map(Duration::from_secs_f64)) + .map_err(|_| { + vm.new_type_error(format!( + "expected an int or float for duration, got {}", + obj.class() + )) + }) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/vm/src/stdlib/socket.rs b/vm/src/stdlib/socket.rs index 44b0d04d09..bb4ee90715 100644 --- a/vm/src/stdlib/socket.rs +++ b/vm/src/stdlib/socket.rs @@ -280,20 +280,15 @@ impl PySocket { } #[pymethod] - fn settimeout(&self, timeout: Option>, vm: &VirtualMachine) -> PyResult<()> { - let mut block = if timeout.is_none() { Some(true) } else { None }; - let timeout = timeout.and_then(|n| { - let dur = match n { - Either::A(f) => Duration::from_secs_f64(f), - Either::B(i) => Duration::from_secs(i), - }; - if dur == Duration::from_secs(0) { - block = Some(false); - None - } else { - Some(dur) - } - }); + fn settimeout(&self, timeout: Option, vm: &VirtualMachine) -> PyResult<()> { + // timeout is None: blocking, no timeout + // timeout is 0: non-blocking, no timeout + // otherwise: timeout is timeout, don't change blocking + let (block, timeout) = match timeout { + None => (Some(true), None), + Some(d) if d == Duration::from_secs(0) => (Some(false), None), + Some(d) => (None, Some(d)), + }; self.sock() .set_read_timeout(timeout) .map_err(|err| convert_sock_error(vm, err))?; diff --git a/vm/src/stdlib/time_module.rs b/vm/src/stdlib/time_module.rs index cb5999390d..9b82ef0bc8 100644 --- a/vm/src/stdlib/time_module.rs +++ b/vm/src/stdlib/time_module.rs @@ -15,9 +15,8 @@ use crate::pyobject::{Either, PyClassImpl, PyObjectRef, PyResult, TryFromObject} use crate::vm::VirtualMachine; #[cfg(unix)] -fn time_sleep(seconds: f64, vm: &VirtualMachine) -> PyResult<()> { - // this is basically std::thread::sleep, but that catches interrupts and we don't want to - let dur = Duration::from_secs_f64(seconds); +fn time_sleep(dur: Duration, vm: &VirtualMachine) -> PyResult<()> { + // this is basically std::thread::sleep, but that catches interrupts and we don't want to; let mut ts = libc::timespec { tv_sec: std::cmp::min(libc::time_t::max_value() as u64, dur.as_secs()) as libc::time_t, @@ -34,8 +33,8 @@ fn time_sleep(seconds: f64, vm: &VirtualMachine) -> PyResult<()> { } #[cfg(not(unix))] -fn time_sleep(seconds: f64, _vm: &VirtualMachine) { - std::thread::sleep(Duration::from_secs_f64(seconds)); +fn time_sleep(dur: Duration, _vm: &VirtualMachine) { + std::thread::sleep(dur); } #[cfg(any(not(target_arch = "wasm32"), target_os = "wasi"))] From a48afdd087db272436cad2ac0716ec4f187f8a98 Mon Sep 17 00:00:00 2001 From: coolreader18 <33094578+coolreader18@users.noreply.github.com> Date: Sun, 22 Dec 2019 22:11:45 -0600 Subject: [PATCH 7/7] Add compilation on windows --- vm/src/stdlib/socket.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vm/src/stdlib/socket.rs b/vm/src/stdlib/socket.rs index bb4ee90715..e1fc6b693e 100644 --- a/vm/src/stdlib/socket.rs +++ b/vm/src/stdlib/socket.rs @@ -62,7 +62,7 @@ mod c { pub use winapi::shared::ws2def::*; pub use winapi::um::winsock2::{ SD_BOTH as SHUT_RDWR, SD_RECEIVE as SHUT_RD, SD_SEND as SHUT_WR, SOCK_DGRAM, SOCK_RAW, - SOCK_RDM, SOCK_STREAM, *, + SOCK_RDM, SOCK_STREAM, SOL_SOCKET, SO_BROADCAST, SO_REUSEADDR, *, }; } @@ -312,7 +312,7 @@ impl PySocket { optlen: OptionalArg, vm: &VirtualMachine, ) -> PyResult<()> { - let fd = sock_fileno(&self.sock()); + let fd = sock_fileno(&self.sock()) as _; let ret = match (value, optlen) { (Some(Either::A(b)), OptionalArg::Missing) => b.with_ref(|b| unsafe { c::setsockopt(fd, level, name, b.as_ptr() as *const _, b.len() as _)