diff --git a/Lib/test/cjkencodings/big5-utf8.txt b/Lib/test/cjkencodings/big5-utf8.txt new file mode 100644 index 000000000..a0a534a96 --- /dev/null +++ b/Lib/test/cjkencodings/big5-utf8.txt @@ -0,0 +1,9 @@ +如何在 Python 中使用既有的 C library? + 在資訊科技快速發展的今天, 開發及測試軟體的速度是不容忽視的 +課題. 為加快開發及測試的速度, 我們便常希望能利用一些已開發好的 +library, 並有一個 fast prototyping 的 programming language 可 +供使用. 目前有許許多多的 library 是以 C 寫成, 而 Python 是一個 +fast prototyping 的 programming language. 故我們希望能將既有的 +C library 拿到 Python 的環境中測試及整合. 其中最主要也是我們所 +要討論的問題就是: + diff --git a/Lib/test/cjkencodings/big5.txt b/Lib/test/cjkencodings/big5.txt new file mode 100644 index 000000000..f4424959e --- /dev/null +++ b/Lib/test/cjkencodings/big5.txt @@ -0,0 +1,9 @@ +pb Python ϥάJ C library? +@bTާֳtoi, }oδճn骺t׬Oe +D. [ֶ}oδժt, ڭ̫K`ƱQΤ@Ǥw}on +library, æ@ fast prototyping programming language i +Ѩϥ. ثe\\hh library OH C g, Python O@ +fast prototyping programming language. Gڭ̧ƱNJ +C library Python ҤդξX. 䤤̥Dn]Oڭ̩ +nQתDNO: + diff --git a/Lib/test/cjkencodings/big5hkscs-utf8.txt b/Lib/test/cjkencodings/big5hkscs-utf8.txt new file mode 100644 index 000000000..f744ce9ae --- /dev/null +++ b/Lib/test/cjkencodings/big5hkscs-utf8.txt @@ -0,0 +1,2 @@ +𠄌Ě鵮罓洆 +ÊÊ̄ê êê̄ diff --git a/Lib/test/cjkencodings/big5hkscs.txt b/Lib/test/cjkencodings/big5hkscs.txt new file mode 100644 index 000000000..81c42b350 --- /dev/null +++ b/Lib/test/cjkencodings/big5hkscs.txt @@ -0,0 +1,2 @@ +E\sڍ +fb diff --git a/Lib/test/cjkencodings/cp949-utf8.txt b/Lib/test/cjkencodings/cp949-utf8.txt new file mode 100644 index 000000000..5655e3851 --- /dev/null +++ b/Lib/test/cjkencodings/cp949-utf8.txt @@ -0,0 +1,9 @@ +똠방각하 펲시콜라 + +㉯㉯납!! 因九月패믤릔궈 ⓡⓖ훀¿¿¿ 긍뒙 ⓔ뎨 ㉯. . +亞영ⓔ능횹 . . . . 서울뤄 뎐학乙 家훀 ! ! !ㅠ.ㅠ +흐흐흐 ㄱㄱㄱ☆ㅠ_ㅠ 어릨 탸콰긐 뎌응 칑九들乙 ㉯드긐 +설릌 家훀 . . . . 굴애쉌 ⓔ궈 ⓡ릘㉱긐 因仁川女中까즼 +와쒀훀 ! ! 亞영ⓔ 家능궈 ☆上관 없능궈능 亞능뒈훀 글애듴 +ⓡ려듀九 싀풔숴훀 어릨 因仁川女中싁⑨들앜!! ㉯㉯납♡ ⌒⌒* + diff --git a/Lib/test/cjkencodings/cp949.txt b/Lib/test/cjkencodings/cp949.txt new file mode 100644 index 000000000..16549aa5e --- /dev/null +++ b/Lib/test/cjkencodings/cp949.txt @@ -0,0 +1,9 @@ +c氢 ݶ + +!! Вp ިR ѵ . . +䬿Ѵ . . . . ʫR ! ! !. + ٤_  O h O +j ʫR . . . . ֚f ѱ ސtƒO  +;R ! ! 䬿 ʫɱ ߾ ɱŴ 䬴ɵR ۾֊ +޷ ǴR  Ĩ!! ҡ* + diff --git a/Lib/test/cjkencodings/euc_jisx0213-utf8.txt b/Lib/test/cjkencodings/euc_jisx0213-utf8.txt new file mode 100644 index 000000000..9a56a2e18 --- /dev/null +++ b/Lib/test/cjkencodings/euc_jisx0213-utf8.txt @@ -0,0 +1,8 @@ +Python の開発は、1990 年ごろから開始されています。 +開発者の Guido van Rossum は教育用のプログラミング言語「ABC」の開発に参加していましたが、ABC は実用上の目的にはあまり適していませんでした。 +このため、Guido はより実用的なプログラミング言語の開発を開始し、英国 BBS 放送のコメディ番組「モンティ パイソン」のファンである Guido はこの言語を「Python」と名づけました。 +このような背景から生まれた Python の言語設計は、「シンプル」で「習得が容易」という目標に重点が置かれています。 +多くのスクリプト系言語ではユーザの目先の利便性を優先して色々な機能を言語要素として取り入れる場合が多いのですが、Python ではそういった小細工が追加されることはあまりありません。 +言語自体の機能は最小限に押さえ、必要な機能は拡張モジュールとして追加する、というのが Python のポリシーです。 + +ノか゚ ト゚ トキ喝塀 𡚴𪎌 麀齁𩛰 diff --git a/Lib/test/cjkencodings/euc_jisx0213.txt b/Lib/test/cjkencodings/euc_jisx0213.txt new file mode 100644 index 000000000..51e9268ca --- /dev/null +++ b/Lib/test/cjkencodings/euc_jisx0213.txt @@ -0,0 +1,8 @@ +Python γȯϡ1990 ǯ鳫ϤƤޤ +ȯԤ Guido van Rossum ϶ѤΥץߥ󥰸ABCפγȯ˻äƤޤABC ϼѾŪˤϤޤŬƤޤǤ +ΤᡢGuido ϤŪʥץߥ󥰸γȯ򳫻Ϥѹ BBS Υǥȡ֥ƥ ѥפΥեǤ Guido ϤθPythonפ̾Ťޤ +Τ褦طʤޤ줿 Python θ߷פϡ֥ץפǡֽưספȤɸ˽֤Ƥޤ +¿ΥץȷϸǤϥ桼ͥ褷ƿʵǽǤȤƼ礬¿ΤǤPython ǤϤäٹɲä뤳ȤϤޤꤢޤ +켫ΤεǽϺǾ¤˲ɬפʵǽϳĥ⥸塼Ȥɲä롢ȤΤ Python ΥݥꥷǤ + +Τ ȥ ԏ diff --git a/Lib/test/cjkencodings/euc_jp-utf8.txt b/Lib/test/cjkencodings/euc_jp-utf8.txt new file mode 100644 index 000000000..7763250eb --- /dev/null +++ b/Lib/test/cjkencodings/euc_jp-utf8.txt @@ -0,0 +1,7 @@ +Python の開発は、1990 年ごろから開始されています。 +開発者の Guido van Rossum は教育用のプログラミング言語「ABC」の開発に参加していましたが、ABC は実用上の目的にはあまり適していませんでした。 +このため、Guido はより実用的なプログラミング言語の開発を開始し、英国 BBS 放送のコメディ番組「モンティ パイソン」のファンである Guido はこの言語を「Python」と名づけました。 +このような背景から生まれた Python の言語設計は、「シンプル」で「習得が容易」という目標に重点が置かれています。 +多くのスクリプト系言語ではユーザの目先の利便性を優先して色々な機能を言語要素として取り入れる場合が多いのですが、Python ではそういった小細工が追加されることはあまりありません。 +言語自体の機能は最小限に押さえ、必要な機能は拡張モジュールとして追加する、というのが Python のポリシーです。 + diff --git a/Lib/test/cjkencodings/euc_jp.txt b/Lib/test/cjkencodings/euc_jp.txt new file mode 100644 index 000000000..9da6b5d83 --- /dev/null +++ b/Lib/test/cjkencodings/euc_jp.txt @@ -0,0 +1,7 @@ +Python γȯϡ1990 ǯ鳫ϤƤޤ +ȯԤ Guido van Rossum ϶ѤΥץߥ󥰸ABCפγȯ˻äƤޤABC ϼѾŪˤϤޤŬƤޤǤ +ΤᡢGuido ϤŪʥץߥ󥰸γȯ򳫻Ϥѹ BBS Υǥȡ֥ƥ ѥפΥեǤ Guido ϤθPythonפ̾Ťޤ +Τ褦طʤޤ줿 Python θ߷פϡ֥ץפǡֽưספȤɸ˽֤Ƥޤ +¿ΥץȷϸǤϥ桼ͥ褷ƿʵǽǤȤƼ礬¿ΤǤPython ǤϤäٹɲä뤳ȤϤޤꤢޤ +켫ΤεǽϺǾ¤˲ɬפʵǽϳĥ⥸塼Ȥɲä롢ȤΤ Python ΥݥꥷǤ + diff --git a/Lib/test/cjkencodings/euc_kr-utf8.txt b/Lib/test/cjkencodings/euc_kr-utf8.txt new file mode 100644 index 000000000..16c37412b --- /dev/null +++ b/Lib/test/cjkencodings/euc_kr-utf8.txt @@ -0,0 +1,7 @@ +◎ 파이썬(Python)은 배우기 쉽고, 강력한 프로그래밍 언어입니다. 파이썬은 +효율적인 고수준 데이터 구조와 간단하지만 효율적인 객체지향프로그래밍을 +지원합니다. 파이썬의 우아(優雅)한 문법과 동적 타이핑, 그리고 인터프리팅 +환경은 파이썬을 스크립팅과 여러 분야에서와 대부분의 플랫폼에서의 빠른 +애플리케이션 개발을 할 수 있는 이상적인 언어로 만들어줍니다. + +☆첫가끝: 날아라 쓔쓔쓩~ 닁큼! 뜽금없이 전홥니다. 뷁. 그런거 읎다. diff --git a/Lib/test/cjkencodings/euc_kr.txt b/Lib/test/cjkencodings/euc_kr.txt new file mode 100644 index 000000000..f68dd3502 --- /dev/null +++ b/Lib/test/cjkencodings/euc_kr.txt @@ -0,0 +1,7 @@ + ̽(Python) , α׷ Դϴ. ̽ +ȿ ȿ üα׷ +մϴ. ̽ () Ÿ, ׸ +ȯ ̽ ũð о߿ κ ÷ +ø̼ ִ ̻ ݴϴ. + +ù: ƶ ԤФԤԤФԾ~ ԤҤŭ! ԤѤݾ ԤȤϴ. ԤΤ. ׷ ԤѤ. diff --git a/Lib/test/cjkencodings/gb18030-utf8.txt b/Lib/test/cjkencodings/gb18030-utf8.txt new file mode 100644 index 000000000..2060d2593 --- /dev/null +++ b/Lib/test/cjkencodings/gb18030-utf8.txt @@ -0,0 +1,15 @@ +Python(派森)语言是一种功能强大而完善的通用型计算机程序设计语言, +已经具有十多年的发展历史,成熟且稳定。这种语言具有非常简捷而清晰 +的语法特点,适合完成各种高层任务,几乎可以在所有的操作系统中 +运行。这种语言简单而强大,适合各种人士学习使用。目前,基于这 +种语言的相关技术正在飞速的发展,用户数量急剧扩大,相关的资源非常多。 +如何在 Python 中使用既有的 C library? + 在資訊科技快速發展的今天, 開發及測試軟體的速度是不容忽視的 +課題. 為加快開發及測試的速度, 我們便常希望能利用一些已開發好的 +library, 並有一個 fast prototyping 的 programming language 可 +供使用. 目前有許許多多的 library 是以 C 寫成, 而 Python 是一個 +fast prototyping 的 programming language. 故我們希望能將既有的 +C library 拿到 Python 的環境中測試及整合. 其中最主要也是我們所 +要討論的問題就是: +파이썬은 강력한 기능을 지닌 범용 컴퓨터 프로그래밍 언어다. + diff --git a/Lib/test/cjkencodings/gb18030.txt b/Lib/test/cjkencodings/gb18030.txt new file mode 100644 index 000000000..5d1f6dca2 --- /dev/null +++ b/Lib/test/cjkencodings/gb18030.txt @@ -0,0 +1,15 @@ +PythonɭһֹǿƵͨͼԣ +ѾʮķչʷȶԾзdzݶ +﷨ص㣬ʺɸָ߲񣬼еIJϵͳ +СԼ򵥶ǿʺϸʿѧϰʹáĿǰ +ԵؼڷٵķչûصԴdzࡣ + Python ʹüе C library? +YӍƼٰlչĽ, _lyԇܛwٶDzݺҕ +n}. ӿ_lyԇٶ, ҂㳣ϣһЩ_lõ +library, Kһ fast prototyping programming language +ʹ. ĿǰSS library C , Python һ +fast prototyping programming language. ҂ϣ܌е +C library õ Python ĭhМyԇ. ҪҲ҂ +ҪӑՓĆ}: +51332131 760463 858635 3195 0930 435755 5509899304 292599. + diff --git a/Lib/test/cjkencodings/gb2312-utf8.txt b/Lib/test/cjkencodings/gb2312-utf8.txt new file mode 100644 index 000000000..efb7d8f95 --- /dev/null +++ b/Lib/test/cjkencodings/gb2312-utf8.txt @@ -0,0 +1,6 @@ +Python(派森)语言是一种功能强大而完善的通用型计算机程序设计语言, +已经具有十多年的发展历史,成熟且稳定。这种语言具有非常简捷而清晰 +的语法特点,适合完成各种高层任务,几乎可以在所有的操作系统中 +运行。这种语言简单而强大,适合各种人士学习使用。目前,基于这 +种语言的相关技术正在飞速的发展,用户数量急剧扩大,相关的资源非常多。 + diff --git a/Lib/test/cjkencodings/gb2312.txt b/Lib/test/cjkencodings/gb2312.txt new file mode 100644 index 000000000..1536ac10b --- /dev/null +++ b/Lib/test/cjkencodings/gb2312.txt @@ -0,0 +1,6 @@ +PythonɭһֹǿƵͨͼԣ +ѾʮķչʷȶԾзdzݶ +﷨ص㣬ʺɸָ߲񣬼еIJϵͳ +СԼ򵥶ǿʺϸʿѧϰʹáĿǰ +ԵؼڷٵķչûصԴdzࡣ + diff --git a/Lib/test/cjkencodings/gbk-utf8.txt b/Lib/test/cjkencodings/gbk-utf8.txt new file mode 100644 index 000000000..75bbd31ec --- /dev/null +++ b/Lib/test/cjkencodings/gbk-utf8.txt @@ -0,0 +1,14 @@ +Python(派森)语言是一种功能强大而完善的通用型计算机程序设计语言, +已经具有十多年的发展历史,成熟且稳定。这种语言具有非常简捷而清晰 +的语法特点,适合完成各种高层任务,几乎可以在所有的操作系统中 +运行。这种语言简单而强大,适合各种人士学习使用。目前,基于这 +种语言的相关技术正在飞速的发展,用户数量急剧扩大,相关的资源非常多。 +如何在 Python 中使用既有的 C library? + 在資訊科技快速發展的今天, 開發及測試軟體的速度是不容忽視的 +課題. 為加快開發及測試的速度, 我們便常希望能利用一些已開發好的 +library, 並有一個 fast prototyping 的 programming language 可 +供使用. 目前有許許多多的 library 是以 C 寫成, 而 Python 是一個 +fast prototyping 的 programming language. 故我們希望能將既有的 +C library 拿到 Python 的環境中測試及整合. 其中最主要也是我們所 +要討論的問題就是: + diff --git a/Lib/test/cjkencodings/gbk.txt b/Lib/test/cjkencodings/gbk.txt new file mode 100644 index 000000000..8788f8a2d --- /dev/null +++ b/Lib/test/cjkencodings/gbk.txt @@ -0,0 +1,14 @@ +PythonɭһֹǿƵͨͼԣ +ѾʮķչʷȶԾзdzݶ +﷨ص㣬ʺɸָ߲񣬼еIJϵͳ +СԼ򵥶ǿʺϸʿѧϰʹáĿǰ +ԵؼڷٵķչûصԴdzࡣ + Python ʹüе C library? +YӍƼٰlչĽ, _lyԇܛwٶDzݺҕ +n}. ӿ_lyԇٶ, ҂㳣ϣһЩ_lõ +library, Kһ fast prototyping programming language +ʹ. ĿǰSS library C , Python һ +fast prototyping programming language. ҂ϣ܌е +C library õ Python ĭhМyԇ. ҪҲ҂ +ҪӑՓĆ}: + diff --git a/Lib/test/cjkencodings/hz-utf8.txt b/Lib/test/cjkencodings/hz-utf8.txt new file mode 100644 index 000000000..7c11735c1 --- /dev/null +++ b/Lib/test/cjkencodings/hz-utf8.txt @@ -0,0 +1,2 @@ +This sentence is in ASCII. +The next sentence is in GB.己所不欲,勿施於人。Bye. diff --git a/Lib/test/cjkencodings/hz.txt b/Lib/test/cjkencodings/hz.txt new file mode 100644 index 000000000..f882d4634 --- /dev/null +++ b/Lib/test/cjkencodings/hz.txt @@ -0,0 +1,2 @@ +This sentence is in ASCII. +The next sentence is in GB.~{<:Ky2;S{#,NpJ)l6HK!#~}Bye. diff --git a/Lib/test/cjkencodings/iso2022_jp-utf8.txt b/Lib/test/cjkencodings/iso2022_jp-utf8.txt new file mode 100644 index 000000000..7763250eb --- /dev/null +++ b/Lib/test/cjkencodings/iso2022_jp-utf8.txt @@ -0,0 +1,7 @@ +Python の開発は、1990 年ごろから開始されています。 +開発者の Guido van Rossum は教育用のプログラミング言語「ABC」の開発に参加していましたが、ABC は実用上の目的にはあまり適していませんでした。 +このため、Guido はより実用的なプログラミング言語の開発を開始し、英国 BBS 放送のコメディ番組「モンティ パイソン」のファンである Guido はこの言語を「Python」と名づけました。 +このような背景から生まれた Python の言語設計は、「シンプル」で「習得が容易」という目標に重点が置かれています。 +多くのスクリプト系言語ではユーザの目先の利便性を優先して色々な機能を言語要素として取り入れる場合が多いのですが、Python ではそういった小細工が追加されることはあまりありません。 +言語自体の機能は最小限に押さえ、必要な機能は拡張モジュールとして追加する、というのが Python のポリシーです。 + diff --git a/Lib/test/cjkencodings/iso2022_jp.txt b/Lib/test/cjkencodings/iso2022_jp.txt new file mode 100644 index 000000000..fc398d64a --- /dev/null +++ b/Lib/test/cjkencodings/iso2022_jp.txt @@ -0,0 +1,7 @@ +Python $B$N3+H/$O!"(B1990 $BG/$4$m$+$i3+;O$5$l$F$$$^$9!#(B +$B3+H/e$NL\E*$K$O$"$^$jE,$7$F$$$^$;$s$G$7$?!#(B +$B$3$N$?$a!"(BGuido $B$O$h$j$E$1$^$7$?!#(B +$B$3$N$h$&$JGX7J$+$i@8$^$l$?(B Python $B$N8@8l@_7W$O!"!V%7%s%W%k!W$G!V=,F@$,MF0W!W$H$$$&L\I8$K=EE@$,CV$+$l$F$$$^$9!#(B +$BB?$/$N%9%/%j%W%H7O8@8l$G$O%f!<%6$NL\@h$NMxJX@-$rM%@h$7$F?'!9$J5!G=$r8@8lMWAG$H$7$Fl9g$,B?$$$N$G$9$,!"(BPython $B$G$O$=$&$$$C$?>.:Y9)$,DI2C$5$l$k$3$H$O$"$^$j$"$j$^$;$s!#(B +$B8@8l<+BN$N5!G=$O:G>.8B$K2!$5$(!"I,MW$J5!G=$O3HD%%b%8%e!<%k$H$7$FDI2C$9$k!"$H$$$&$N$,(B Python $B$N%]%j%7!<$G$9!#(B + diff --git a/Lib/test/cjkencodings/iso2022_kr-utf8.txt b/Lib/test/cjkencodings/iso2022_kr-utf8.txt new file mode 100644 index 000000000..d5c9d6eee --- /dev/null +++ b/Lib/test/cjkencodings/iso2022_kr-utf8.txt @@ -0,0 +1,7 @@ +◎ 파이썬(Python)은 배우기 쉽고, 강력한 프로그래밍 언어입니다. 파이썬은 +효율적인 고수준 데이터 구조와 간단하지만 효율적인 객체지향프로그래밍을 +지원합니다. 파이썬의 우아(優雅)한 문법과 동적 타이핑, 그리고 인터프리팅 +환경은 파이썬을 스크립팅과 여러 분야에서와 대부분의 플랫폼에서의 빠른 +애플리케이션 개발을 할 수 있는 이상적인 언어로 만들어줍니다. + +☆첫가끝: 날아라 쓩~ 큼! 금없이 전니다. 그런거 다. diff --git a/Lib/test/cjkencodings/iso2022_kr.txt b/Lib/test/cjkencodings/iso2022_kr.txt new file mode 100644 index 000000000..2cece21c5 --- /dev/null +++ b/Lib/test/cjkencodings/iso2022_kr.txt @@ -0,0 +1,7 @@ +$)C!] FD@L=c(Python)@: 9h?l1b =10m, 0-7BGQ GA7N1W7!9V >p>n@T4O4Y. FD@L=c@: +H?@2@{@N 0mF(iPd:)GQ 9.9}0z 5?@{ E8@LGN, 1W8.0m @NEMGA8.FC +H/0f@: FD@L=c@; =:E)83FC0z ?)7/ :P>_?!<-?M 4k:N:P@G GC7'F{?!<-@G :|8% +>VGC8.DI@Lp>n7N 885i>nA]4O4Y. + +!YC90!3!: 3/>F6s >1~ E-! 1]>x@L @|4O4Y. 1W710E 4Y. diff --git a/Lib/test/cjkencodings/johab-utf8.txt b/Lib/test/cjkencodings/johab-utf8.txt new file mode 100644 index 000000000..5655e3851 --- /dev/null +++ b/Lib/test/cjkencodings/johab-utf8.txt @@ -0,0 +1,9 @@ +똠방각하 펲시콜라 + +㉯㉯납!! 因九月패믤릔궈 ⓡⓖ훀¿¿¿ 긍뒙 ⓔ뎨 ㉯. . +亞영ⓔ능횹 . . . . 서울뤄 뎐학乙 家훀 ! ! !ㅠ.ㅠ +흐흐흐 ㄱㄱㄱ☆ㅠ_ㅠ 어릨 탸콰긐 뎌응 칑九들乙 ㉯드긐 +설릌 家훀 . . . . 굴애쉌 ⓔ궈 ⓡ릘㉱긐 因仁川女中까즼 +와쒀훀 ! ! 亞영ⓔ 家능궈 ☆上관 없능궈능 亞능뒈훀 글애듴 +ⓡ려듀九 싀풔숴훀 어릨 因仁川女中싁⑨들앜!! ㉯㉯납♡ ⌒⌒* + diff --git a/Lib/test/cjkencodings/johab.txt b/Lib/test/cjkencodings/johab.txt new file mode 100644 index 000000000..067781b78 --- /dev/null +++ b/Lib/test/cjkencodings/johab.txt @@ -0,0 +1,9 @@ +wba \ũa + +s!! gÚ zٯٯٯ w ѕ . . + 0: + myreplace.limit -= 1 + return ('REPLACED', 0) + else: + return ('TERMINAL', exc.end) + myreplace.limit = 3 + codecs.register_error("test.cjktest", myreplace) + self.assertEqual(self.encode('abcd' + self.unmappedunicode + 'efgh', + 'test.cjktest'), + (b'abcdREPLACEDabcdREPLACEDabcdREPLACEDabcdTERMINALefgh', 9)) + + def test_callback_forward_index(self): + def myreplace(exc): + return ('REPLACED', exc.end + 2) + codecs.register_error("test.cjktest", myreplace) + self.assertEqual(self.encode('abcd' + self.unmappedunicode + 'efgh', + 'test.cjktest'), (b'abcdREPLACEDgh', 9)) + + def test_callback_index_outofbound(self): + def myreplace(exc): + return ('TERM', 100) + codecs.register_error("test.cjktest", myreplace) + self.assertRaises(IndexError, self.encode, self.unmappedunicode, + 'test.cjktest') + + def test_incrementalencoder(self): + UTF8Reader = codecs.getreader('utf-8') + for sizehint in [None] + list(range(1, 33)) + \ + [64, 128, 256, 512, 1024]: + istream = UTF8Reader(BytesIO(self.tstring[1])) + ostream = BytesIO() + encoder = self.incrementalencoder() + while 1: + if sizehint is not None: + data = istream.read(sizehint) + else: + data = istream.read() + + if not data: + break + e = encoder.encode(data) + ostream.write(e) + + self.assertEqual(ostream.getvalue(), self.tstring[0]) + + def test_incrementaldecoder(self): + UTF8Writer = codecs.getwriter('utf-8') + for sizehint in [None, -1] + list(range(1, 33)) + \ + [64, 128, 256, 512, 1024]: + istream = BytesIO(self.tstring[0]) + ostream = UTF8Writer(BytesIO()) + decoder = self.incrementaldecoder() + while 1: + data = istream.read(sizehint) + if not data: + break + else: + u = decoder.decode(data) + ostream.write(u) + + self.assertEqual(ostream.getvalue(), self.tstring[1]) + + def test_incrementalencoder_error_callback(self): + inv = self.unmappedunicode + + e = self.incrementalencoder() + self.assertRaises(UnicodeEncodeError, e.encode, inv, True) + + e.errors = 'ignore' + self.assertEqual(e.encode(inv, True), b'') + + e.reset() + def tempreplace(exc): + return ('called', exc.end) + codecs.register_error('test.incremental_error_callback', tempreplace) + e.errors = 'test.incremental_error_callback' + self.assertEqual(e.encode(inv, True), b'called') + + # again + e.errors = 'ignore' + self.assertEqual(e.encode(inv, True), b'') + + def test_streamreader(self): + UTF8Writer = codecs.getwriter('utf-8') + for name in ["read", "readline", "readlines"]: + for sizehint in [None, -1] + list(range(1, 33)) + \ + [64, 128, 256, 512, 1024]: + istream = self.reader(BytesIO(self.tstring[0])) + ostream = UTF8Writer(BytesIO()) + func = getattr(istream, name) + while 1: + data = func(sizehint) + if not data: + break + if name == "readlines": + ostream.writelines(data) + else: + ostream.write(data) + + self.assertEqual(ostream.getvalue(), self.tstring[1]) + + def test_streamwriter(self): + readfuncs = ('read', 'readline', 'readlines') + UTF8Reader = codecs.getreader('utf-8') + for name in readfuncs: + for sizehint in [None] + list(range(1, 33)) + \ + [64, 128, 256, 512, 1024]: + istream = UTF8Reader(BytesIO(self.tstring[1])) + ostream = self.writer(BytesIO()) + func = getattr(istream, name) + while 1: + if sizehint is not None: + data = func(sizehint) + else: + data = func() + + if not data: + break + if name == "readlines": + ostream.writelines(data) + else: + ostream.write(data) + + self.assertEqual(ostream.getvalue(), self.tstring[0]) + + def test_streamwriter_reset_no_pending(self): + # Issue #23247: Calling reset() on a fresh StreamWriter instance + # (without pending data) must not crash + stream = BytesIO() + writer = self.writer(stream) + writer.reset() + + def test_incrementalencoder_del_segfault(self): + e = self.incrementalencoder() + with self.assertRaises(AttributeError): + del e.errors + + def test_null_terminator(self): + # see gh-101828 + text = "フルーツ" + try: + text.encode(self.encoding) + except UnicodeEncodeError: + text = "Python is cool" + encode_w_null = (text + "\0").encode(self.encoding) + encode_plus_null = text.encode(self.encoding) + "\0".encode(self.encoding) + self.assertTrue(encode_w_null.endswith(b'\x00')) + self.assertEqual(encode_w_null, encode_plus_null) + + encode_w_null_2 = (text + "\0" + text + "\0").encode(self.encoding) + encode_plus_null_2 = encode_plus_null + encode_plus_null + self.assertEqual(encode_w_null_2.count(b'\x00'), 2) + self.assertEqual(encode_w_null_2, encode_plus_null_2) + + +class TestBase_Mapping(unittest.TestCase): + pass_enctest = [] + pass_dectest = [] + supmaps = [] + codectests = [] + + def setUp(self): + try: + self.open_mapping_file().close() # test it to report the error early + except (OSError, HTTPException): + self.skipTest("Could not retrieve "+self.mapfileurl) + + def open_mapping_file(self): + return support.open_urlresource(self.mapfileurl, encoding="utf-8") + + def test_mapping_file(self): + if self.mapfileurl.endswith('.xml'): + self._test_mapping_file_ucm() + else: + self._test_mapping_file_plain() + + def _test_mapping_file_plain(self): + def unichrs(s): + return ''.join(chr(int(x, 16)) for x in s.split('+')) + + urt_wa = {} + + with self.open_mapping_file() as f: + for line in f: + if not line: + break + data = line.split('#')[0].split() + if len(data) != 2: + continue + + if data[0][:2] != '0x': + self.fail(f"Invalid line: {line!r}") + csetch = bytes.fromhex(data[0][2:]) + if len(csetch) == 1 and 0x80 <= csetch[0]: + continue + + unich = unichrs(data[1]) + if ord(unich) == 0xfffd or unich in urt_wa: + continue + urt_wa[unich] = csetch + + self._testpoint(csetch, unich) + + def _test_mapping_file_ucm(self): + with self.open_mapping_file() as f: + ucmdata = f.read() + uc = re.findall('', ucmdata) + for uni, coded in uc: + unich = chr(int(uni, 16)) + codech = bytes.fromhex(coded) + self._testpoint(codech, unich) + + def test_mapping_supplemental(self): + for mapping in self.supmaps: + self._testpoint(*mapping) + + def _testpoint(self, csetch, unich): + if (csetch, unich) not in self.pass_enctest: + self.assertEqual(unich.encode(self.encoding), csetch) + if (csetch, unich) not in self.pass_dectest: + self.assertEqual(str(csetch, self.encoding), unich) + + def test_errorhandle(self): + for source, scheme, expected in self.codectests: + if isinstance(source, bytes): + func = source.decode + else: + func = source.encode + if expected: + if isinstance(source, bytes): + result = func(self.encoding, scheme) + self.assertTrue(type(result) is str, type(result)) + self.assertEqual(result, expected, + '%a.decode(%r, %r)=%a != %a' + % (source, self.encoding, scheme, result, + expected)) + else: + result = func(self.encoding, scheme) + self.assertTrue(type(result) is bytes, type(result)) + self.assertEqual(result, expected, + '%a.encode(%r, %r)=%a != %a' + % (source, self.encoding, scheme, result, + expected)) + else: + self.assertRaises(UnicodeError, func, self.encoding, scheme) + +def load_teststring(name): + dir = os.path.join(os.path.dirname(__file__), 'cjkencodings') + with open(os.path.join(dir, name + '.txt'), 'rb') as f: + encoded = f.read() + with open(os.path.join(dir, name + '-utf8.txt'), 'rb') as f: + utf8 = f.read() + return encoded, utf8 diff --git a/Lib/test/test_codecencodings_cn.py b/Lib/test/test_codecencodings_cn.py new file mode 100644 index 000000000..2a450718d --- /dev/null +++ b/Lib/test/test_codecencodings_cn.py @@ -0,0 +1,96 @@ +# +# test_codecencodings_cn.py +# Codec encoding tests for PRC encodings. +# + +from test import multibytecodec_support +import unittest + +class Test_GB2312(multibytecodec_support.TestBase, unittest.TestCase): + encoding = 'gb2312' + tstring = multibytecodec_support.load_teststring('gb2312') + codectests = ( + # invalid bytes + (b"abc\x81\x81\xc1\xc4", "strict", None), + (b"abc\xc8", "strict", None), + (b"abc\x81\x81\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"), + (b"abc\x81\x81\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"), + (b"abc\x81\x81\xc1\xc4", "ignore", "abc\u804a"), + (b"\xc1\x64", "strict", None), + ) + +class Test_GBK(multibytecodec_support.TestBase, unittest.TestCase): + encoding = 'gbk' + tstring = multibytecodec_support.load_teststring('gbk') + codectests = ( + # invalid bytes + (b"abc\x80\x80\xc1\xc4", "strict", None), + (b"abc\xc8", "strict", None), + (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"), + (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"), + (b"abc\x80\x80\xc1\xc4", "ignore", "abc\u804a"), + (b"\x83\x34\x83\x31", "strict", None), + ("\u30fb", "strict", None), + ) + +class Test_GB18030(multibytecodec_support.TestBase, unittest.TestCase): + encoding = 'gb18030' + tstring = multibytecodec_support.load_teststring('gb18030') + codectests = ( + # invalid bytes + (b"abc\x80\x80\xc1\xc4", "strict", None), + (b"abc\xc8", "strict", None), + (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"), + (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"), + (b"abc\x80\x80\xc1\xc4", "ignore", "abc\u804a"), + (b"abc\x84\x39\x84\x39\xc1\xc4", "replace", "abc\ufffd9\ufffd9\u804a"), + ("\u30fb", "strict", b"\x819\xa79"), + (b"abc\x84\x32\x80\x80def", "replace", 'abc\ufffd2\ufffd\ufffddef'), + (b"abc\x81\x30\x81\x30def", "strict", 'abc\x80def'), + (b"abc\x86\x30\x81\x30def", "replace", 'abc\ufffd0\ufffd0def'), + # issue29990 + (b"\xff\x30\x81\x30", "strict", None), + (b"\x81\x30\xff\x30", "strict", None), + (b"abc\x81\x39\xff\x39\xc1\xc4", "replace", "abc\ufffd\x39\ufffd\x39\u804a"), + (b"abc\xab\x36\xff\x30def", "replace", 'abc\ufffd\x36\ufffd\x30def'), + (b"abc\xbf\x38\xff\x32\xc1\xc4", "ignore", "abc\x38\x32\u804a"), + ) + has_iso10646 = True + +class Test_HZ(multibytecodec_support.TestBase, unittest.TestCase): + encoding = 'hz' + tstring = multibytecodec_support.load_teststring('hz') + codectests = ( + # test '~\n' (3 lines) + (b'This sentence is in ASCII.\n' + b'The next sentence is in GB.~{<:Ky2;S{#,~}~\n' + b'~{NpJ)l6HK!#~}Bye.\n', + 'strict', + 'This sentence is in ASCII.\n' + 'The next sentence is in GB.' + '\u5df1\u6240\u4e0d\u6b32\uff0c\u52ff\u65bd\u65bc\u4eba\u3002' + 'Bye.\n'), + # test '~\n' (4 lines) + (b'This sentence is in ASCII.\n' + b'The next sentence is in GB.~\n' + b'~{<:Ky2;S{#,NpJ)l6HK!#~}~\n' + b'Bye.\n', + 'strict', + 'This sentence is in ASCII.\n' + 'The next sentence is in GB.' + '\u5df1\u6240\u4e0d\u6b32\uff0c\u52ff\u65bd\u65bc\u4eba\u3002' + 'Bye.\n'), + # invalid bytes + (b'ab~cd', 'replace', 'ab\uFFFDcd'), + (b'ab\xffcd', 'replace', 'ab\uFFFDcd'), + (b'ab~{\x81\x81\x41\x44~}cd', 'replace', 'ab\uFFFD\uFFFD\u804Acd'), + (b'ab~{\x41\x44~}cd', 'replace', 'ab\u804Acd'), + (b"ab~{\x79\x79\x41\x44~}cd", "replace", "ab\ufffd\ufffd\u804acd"), + # issue 30003 + ('ab~cd', 'strict', b'ab~~cd'), # escape ~ + (b'~{Dc~~:C~}', 'strict', None), # ~~ only in ASCII mode + (b'~{Dc~\n:C~}', 'strict', None), # ~\n only in ASCII mode + ) + +if __name__ == "__main__": + unittest.main()