Compare commits

..

17 Commits

Author SHA1 Message Date
copilot-swe-agent[bot]
4efa5da5f6 Simplify unicode regex call sites
Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/b894057a-9bed-4f35-8400-a5731c63602d

Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com>
2026-04-06 02:07:56 +00:00
copilot-swe-agent[bot]
2934897035 Polish unicode follow-up review fixes
Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/1d30ae08-d8f0-431c-9299-8aea5c21f7d4

Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com>
2026-04-05 08:54:41 +00:00
copilot-swe-agent[bot]
0a340de9c3 Finish unicode crate follow-up refactors
Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/1d30ae08-d8f0-431c-9299-8aea5c21f7d4

Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com>
2026-04-05 08:53:23 +00:00
copilot-swe-agent[bot]
5cf1bd6667 Polish unicode regex helpers
Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/e4115dbc-b1a5-4a77-90a7-38cf9ac1cdf2

Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com>
2026-04-05 07:45:20 +00:00
copilot-swe-agent[bot]
e968d83808 Tidy shared unicode review feedback
Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/e4115dbc-b1a5-4a77-90a7-38cf9ac1cdf2

Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com>
2026-04-05 07:44:23 +00:00
copilot-swe-agent[bot]
67485b5b77 Extract shared unicode crate
Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/e4115dbc-b1a5-4a77-90a7-38cf9ac1cdf2

Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com>
2026-04-05 07:39:45 +00:00
copilot-swe-agent[bot]
d3af1c54ec Initial plan 2026-04-05 07:11:29 +00:00
Copilot
eed618d858 Fix str.zfill() width calculation for non-ASCII strings (#7534) 2026-04-04 14:45:30 +09:00
Bo Maryniuk
87fc4540c4 Fix VM's infinite recursion crash with musl libc (#7558)
* Fix VM's infinite recursion crash with musl libc

* Lintfix/cleanup warnings
2026-04-04 09:46:46 +09:00
dependabot[bot]
a09afab912 Bump lodash from 4.17.23 to 4.18.1 in /wasm/demo (#7556)
Bumps [lodash](https://github.com/lodash/lodash) from 4.17.23 to 4.18.1.
- [Release notes](https://github.com/lodash/lodash/releases)
- [Commits](https://github.com/lodash/lodash/compare/4.17.23...4.18.1)

---
updated-dependencies:
- dependency-name: lodash
  dependency-version: 4.18.1
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2026-04-03 01:44:50 +09:00
Shahar Naveh
3d9688402a Replace unmaintained unic crates (#7555) 2026-04-03 01:43:11 +09:00
Shahar Naveh
b61dfdc534 Update test_optimizer.py from 3.14.3 (#7554) 2026-04-03 01:33:50 +09:00
Shahar Naveh
6d7d74cc0b Update test_opcache.py from 3.14.3 (#7553) 2026-04-03 01:33:34 +09:00
Shahar Naveh
3f49f42702 Update test_module from 3.14.3 (#7552)
* Update `test_module` from 3.14.3

* Unmark passing test
2026-04-03 01:33:19 +09:00
Shahar Naveh
5afa3493a1 Add test_perfmaps.py from 3.14.3 (#7551) 2026-04-03 00:50:25 +09:00
Shahar Naveh
1adda8a73d Update test_file & test_largefile from 3.14.3 (#7550) 2026-04-03 00:49:55 +09:00
Shahar Naveh
344b7a5abd [zizmor] ignore superfluous-actions (#7548) 2026-04-03 00:44:58 +09:00
44 changed files with 2219 additions and 795 deletions

View File

@@ -50,10 +50,9 @@ jobs:
with: with:
persist-credentials: false persist-credentials: false
- uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 - uses: dtolnay/rust-toolchain@stable
with: with:
components: clippy components: clippy
toolchain: stable
- uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1 - uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1
with: with:
@@ -170,7 +169,9 @@ jobs:
restore-keys: | restore-keys: |
cargo-check-${{ runner.os }}-${{ matrix.target }}- cargo-check-${{ runner.os }}-${{ matrix.target }}-
- run: rustup toolchain install stable --target "${{ matrix.target }}" - uses: dtolnay/rust-toolchain@stable
with:
target: ${{ matrix.target }}
- name: Setup Android NDK - name: Setup Android NDK
if: ${{ matrix.target == 'aarch64-linux-android' }} if: ${{ matrix.target == 'aarch64-linux-android' }}
@@ -251,9 +252,7 @@ jobs:
with: with:
persist-credentials: false persist-credentials: false
- uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 - uses: dtolnay/rust-toolchain@stable
with:
toolchain: stable
- uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1 - uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1
with: with:
@@ -366,9 +365,8 @@ jobs:
with: with:
python-version: ${{ env.PYTHON_VERSION }} python-version: ${{ env.PYTHON_VERSION }}
- uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 - uses: dtolnay/rust-toolchain@stable
with: with:
toolchain: stable
components: rustfmt components: rustfmt
- uses: cargo-bins/cargo-binstall@113a77a4ce971c41332f2129c3d995df993cf746 # v1.17.8 - uses: cargo-bins/cargo-binstall@113a77a4ce971c41332f2129c3d995df993cf746 # v1.17.8
@@ -425,7 +423,7 @@ jobs:
with: with:
persist-credentials: false persist-credentials: false
- uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 - uses: dtolnay/rust-toolchain@master
with: with:
toolchain: ${{ env.NIGHTLY_CHANNEL }} toolchain: ${{ env.NIGHTLY_CHANNEL }}
components: miri components: miri
@@ -451,10 +449,9 @@ jobs:
with: with:
persist-credentials: false persist-credentials: false
- uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 - uses: dtolnay/rust-toolchain@stable
with: with:
components: clippy components: clippy
toolchain: stable
- uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1 - uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1
with: with:
@@ -529,10 +526,9 @@ jobs:
with: with:
persist-credentials: false persist-credentials: false
- uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 - uses: dtolnay/rust-toolchain@stable
with: with:
target: wasm32-wasip1 target: wasm32-wasip1
toolchain: stable
- uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1 - uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1
with: with:

View File

@@ -52,7 +52,7 @@ jobs:
with: with:
persist-credentials: false persist-credentials: false
- uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 - uses: dtolnay/rust-toolchain@stable
with: with:
target: ${{ matrix.target }} target: ${{ matrix.target }}
@@ -89,7 +89,7 @@ jobs:
with: with:
persist-credentials: false persist-credentials: false
- uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 - uses: dtolnay/rust-toolchain@stable
with: with:
targets: wasm32-wasip1 targets: wasm32-wasip1

14
.github/zizmor.yml vendored Normal file
View File

@@ -0,0 +1,14 @@
rules:
unpinned-uses:
config:
policies:
# dtolnay/rust-toolchain is a trusted action that uses lightweight branch
# refs (@stable, @nightly, etc.) by design. Pinning to a hash would break
# the intended usage pattern.
# We can remove this policy exception once https://github.com/dtolnay/rust-toolchain/issues/180 is resolved.
dtolnay/rust-toolchain: any
# dtolnay/rust-toolchain handles component installation, target addition, and
# override configuration beyond what a bare `rustup` invocation provides.
# See: https://github.com/zizmorcore/zizmor/issues/1817
superfluous-actions:
disable: true

482
Cargo.lock generated
View File

@@ -187,7 +187,7 @@ checksum = "3109e49b1e4909e9db6515a30c633684d68cdeaa252f215214cb4fa1a5bfee2c"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
"synstructure", "synstructure",
] ]
@@ -199,7 +199,7 @@ checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -222,7 +222,7 @@ dependencies = [
"manyhow", "manyhow",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -238,7 +238,7 @@ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"quote-use", "quote-use",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -299,22 +299,22 @@ checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06"
[[package]] [[package]]
name = "bindgen" name = "bindgen"
version = "0.64.0" version = "0.71.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4243e6031260db77ede97ad86c27e501d646a27ab57b59a574f725d98ab1fb4" checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3"
dependencies = [ dependencies = [
"bitflags 1.3.2", "bitflags 2.11.0",
"cexpr", "cexpr",
"clang-sys", "clang-sys",
"lazy_static 1.5.0", "itertools 0.13.0",
"lazycell", "log",
"peeking_take_while", "prettyplease",
"proc-macro2", "proc-macro2",
"quote", "quote",
"regex", "regex",
"rustc-hash 1.1.0", "rustc-hash",
"shlex", "shlex",
"syn 1.0.109", "syn",
] ]
[[package]] [[package]]
@@ -332,9 +332,9 @@ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"regex", "regex",
"rustc-hash 2.1.1", "rustc-hash",
"shlex", "shlex",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -778,7 +778,7 @@ dependencies = [
"libm", "libm",
"log", "log",
"regalloc2", "regalloc2",
"rustc-hash 2.1.1", "rustc-hash",
"serde", "serde",
"smallvec", "smallvec",
"target-lexicon", "target-lexicon",
@@ -1023,7 +1023,7 @@ checksum = "8034092389675178f570469e6c3b0465d3d30b4505c294a6550db47f3c17ad18"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -1043,7 +1043,7 @@ checksum = "ef941ded77d15ca19b40374869ac6000af1c9f2a4c0f3d4c70926287e6364a8f"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -1086,7 +1086,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -1232,7 +1232,7 @@ checksum = "7693d9dd1ec1c54f52195dfe255b627f7cec7da33b679cd56de949e662b3db10"
dependencies = [ dependencies = [
"flame", "flame",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -1333,7 +1333,7 @@ checksum = "f2b6d1e2f75c16bfbcd0f95d84f99858a6e2f885c2287d1f5c3a96e8444a34b4"
dependencies = [ dependencies = [
"attribute-derive", "attribute-derive",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -1503,6 +1503,90 @@ dependencies = [
"cc", "cc",
] ]
[[package]]
name = "icu_collections"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43"
dependencies = [
"displaydoc",
"potential_utf",
"yoke",
"zerofrom",
"zerovec",
]
[[package]]
name = "icu_locale_core"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6"
dependencies = [
"displaydoc",
"litemap",
"tinystr",
"writeable",
"zerovec",
]
[[package]]
name = "icu_normalizer"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599"
dependencies = [
"icu_collections",
"icu_normalizer_data",
"icu_properties",
"icu_provider",
"smallvec",
"utf16_iter",
"utf8_iter",
"write16",
"zerovec",
]
[[package]]
name = "icu_normalizer_data"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a"
[[package]]
name = "icu_properties"
version = "2.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec"
dependencies = [
"icu_collections",
"icu_locale_core",
"icu_properties_data",
"icu_provider",
"zerotrie",
"zerovec",
]
[[package]]
name = "icu_properties_data"
version = "2.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af"
[[package]]
name = "icu_provider"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614"
dependencies = [
"displaydoc",
"icu_locale_core",
"writeable",
"yoke",
"zerofrom",
"zerotrie",
"zerovec",
]
[[package]] [[package]]
name = "indexmap" name = "indexmap"
version = "2.13.0" version = "2.13.0"
@@ -1550,7 +1634,7 @@ dependencies = [
"heck", "heck",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -1604,7 +1688,7 @@ checksum = "2a8c8b344124222efd714b73bb41f8b5120b27a7cc1c75593a6ff768d9d05aa4"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -1680,12 +1764,6 @@ version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
[[package]]
name = "lazycell"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
[[package]] [[package]]
name = "lexical-parse-float" name = "lexical-parse-float"
version = "1.0.6" version = "1.0.6"
@@ -1830,6 +1908,12 @@ version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"
[[package]]
name = "litemap"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77"
[[package]] [[package]]
name = "lock_api" name = "lock_api"
version = "0.4.14" version = "0.4.14"
@@ -1930,7 +2014,7 @@ dependencies = [
"manyhow-macros", "manyhow-macros",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -1950,12 +2034,6 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d"
[[package]]
name = "matches"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2532096657941c2fea9c289d370a250971c689d4f143798ff67113ec042024a5"
[[package]] [[package]]
name = "md-5" name = "md-5"
version = "0.10.6" version = "0.10.6"
@@ -2131,7 +2209,7 @@ checksum = "680998035259dcfcafe653688bf2aa6d3e2dc05e98be6ab46afb089dc84f1df8"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -2193,7 +2271,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -2262,8 +2340,7 @@ dependencies = [
[[package]] [[package]]
name = "parking_lot_core" name = "parking_lot_core"
version = "0.9.12" version = "0.9.12"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "git+https://github.com/youknowone/parking_lot?branch=rustpython#4392edbe879acc9c0dd94eda53d2205d3ab912c9"
checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"libc", "libc",
@@ -2288,12 +2365,6 @@ dependencies = [
"hmac", "hmac",
] ]
[[package]]
name = "peeking_take_while"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
[[package]] [[package]]
name = "pem-rfc7468" name = "pem-rfc7468"
version = "0.7.0" version = "0.7.0"
@@ -2372,7 +2443,7 @@ dependencies = [
"phf_shared 0.13.1", "phf_shared 0.13.1",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -2474,7 +2545,7 @@ checksum = "52a40bc70c2c58040d2d8b167ba9a5ff59fc9dab7ad44771cfde3dcfde7a09c6"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -2492,6 +2563,15 @@ dependencies = [
"portable-atomic", "portable-atomic",
] ]
[[package]]
name = "potential_utf"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77"
dependencies = [
"zerovec",
]
[[package]] [[package]]
name = "powerfmt" name = "powerfmt"
version = "0.2.0" version = "0.2.0"
@@ -2514,7 +2594,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -2603,7 +2683,7 @@ dependencies = [
"proc-macro2", "proc-macro2",
"pyo3-macros-backend", "pyo3-macros-backend",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -2616,7 +2696,7 @@ dependencies = [
"proc-macro2", "proc-macro2",
"pyo3-build-config", "pyo3-build-config",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -2647,7 +2727,7 @@ dependencies = [
"proc-macro-utils", "proc-macro-utils",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -2797,7 +2877,7 @@ checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -2810,7 +2890,7 @@ dependencies = [
"bumpalo", "bumpalo",
"hashbrown 0.15.5", "hashbrown 0.15.5",
"log", "log",
"rustc-hash 2.1.1", "rustc-hash",
"smallvec", "smallvec",
] ]
@@ -2873,7 +2953,7 @@ dependencies = [
"pmutil", "pmutil",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -2890,12 +2970,6 @@ dependencies = [
"windows-sys 0.52.0", "windows-sys 0.52.0",
] ]
[[package]]
name = "rustc-hash"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
[[package]] [[package]]
name = "rustc-hash" name = "rustc-hash"
version = "2.1.1" version = "2.1.1"
@@ -3049,9 +3123,9 @@ dependencies = [
"rustpython-ruff_python_ast", "rustpython-ruff_python_ast",
"rustpython-ruff_python_parser", "rustpython-ruff_python_parser",
"rustpython-ruff_text_size", "rustpython-ruff_text_size",
"rustpython-unicode",
"rustpython-wtf8", "rustpython-wtf8",
"thiserror 2.0.18", "thiserror 2.0.18",
"unicode_names2 2.0.0",
] ]
[[package]] [[package]]
@@ -3074,9 +3148,9 @@ dependencies = [
"parking_lot", "parking_lot",
"radium", "radium",
"rustpython-literal", "rustpython-literal",
"rustpython-unicode",
"rustpython-wtf8", "rustpython-wtf8",
"siphasher", "siphasher",
"unicode_names2 2.0.0",
"widestring", "widestring",
"windows-sys 0.61.2", "windows-sys 0.61.2",
] ]
@@ -3122,7 +3196,7 @@ version = "0.5.0"
dependencies = [ dependencies = [
"rustpython-compiler", "rustpython-compiler",
"rustpython-derive-impl", "rustpython-derive-impl",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -3135,7 +3209,7 @@ dependencies = [
"quote", "quote",
"rustpython-compiler-core", "rustpython-compiler-core",
"rustpython-doc", "rustpython-doc",
"syn 2.0.117", "syn",
"syn-ext", "syn-ext",
"textwrap", "textwrap",
] ]
@@ -3172,8 +3246,8 @@ dependencies = [
"lexical-parse-float", "lexical-parse-float",
"num-traits", "num-traits",
"rand 0.9.2", "rand 0.9.2",
"rustpython-unicode",
"rustpython-wtf8", "rustpython-wtf8",
"unic-ucd-category",
] ]
[[package]] [[package]]
@@ -3197,7 +3271,7 @@ dependencies = [
"get-size2", "get-size2",
"is-macro", "is-macro",
"memchr", "memchr",
"rustc-hash 2.1.1", "rustc-hash",
"rustpython-ruff_python_trivia", "rustpython-ruff_python_trivia",
"rustpython-ruff_source_file", "rustpython-ruff_source_file",
"rustpython-ruff_text_size", "rustpython-ruff_text_size",
@@ -3215,7 +3289,7 @@ dependencies = [
"compact_str", "compact_str",
"get-size2", "get-size2",
"memchr", "memchr",
"rustc-hash 2.1.1", "rustc-hash",
"rustpython-ruff_python_ast", "rustpython-ruff_python_ast",
"rustpython-ruff_python_trivia", "rustpython-ruff_python_trivia",
"rustpython-ruff_text_size", "rustpython-ruff_text_size",
@@ -3264,6 +3338,7 @@ dependencies = [
"criterion", "criterion",
"num_enum", "num_enum",
"optional", "optional",
"rustpython-unicode",
"rustpython-wtf8", "rustpython-wtf8",
] ]
@@ -3335,6 +3410,7 @@ dependencies = [
"rustpython-ruff_python_parser", "rustpython-ruff_python_parser",
"rustpython-ruff_source_file", "rustpython-ruff_source_file",
"rustpython-ruff_text_size", "rustpython-ruff_text_size",
"rustpython-unicode",
"rustpython-vm", "rustpython-vm",
"schannel", "schannel",
"sha-1", "sha-1",
@@ -3345,14 +3421,6 @@ dependencies = [
"tcl-sys", "tcl-sys",
"termios", "termios",
"tk-sys", "tk-sys",
"ucd",
"unic-char-property",
"unic-normal",
"unic-ucd-age",
"unic-ucd-bidi",
"unic-ucd-category",
"unicode-bidi-mirroring",
"unicode_names2 2.0.0",
"uuid", "uuid",
"webpki-roots", "webpki-roots",
"widestring", "widestring",
@@ -3362,6 +3430,21 @@ dependencies = [
"xml", "xml",
] ]
[[package]]
name = "rustpython-unicode"
version = "0.5.0"
dependencies = [
"caseless",
"icu_normalizer",
"icu_properties",
"itertools 0.14.0",
"rustpython-wtf8",
"ucd",
"unic-ucd-age",
"unicode-casing",
"unicode_names2 2.0.0",
]
[[package]] [[package]]
name = "rustpython-venvlauncher" name = "rustpython-venvlauncher"
version = "0.5.0" version = "0.5.0"
@@ -3374,7 +3457,6 @@ dependencies = [
"ascii", "ascii",
"bitflags 2.11.0", "bitflags 2.11.0",
"bstr", "bstr",
"caseless",
"cfg-if", "cfg-if",
"chrono", "chrono",
"constant_time_eq", "constant_time_eq",
@@ -3420,6 +3502,7 @@ dependencies = [
"rustpython-ruff_python_parser", "rustpython-ruff_python_parser",
"rustpython-ruff_text_size", "rustpython-ruff_text_size",
"rustpython-sre_engine", "rustpython-sre_engine",
"rustpython-unicode",
"rustyline", "rustyline",
"scoped-tls", "scoped-tls",
"scopeguard", "scopeguard",
@@ -3430,9 +3513,6 @@ dependencies = [
"thiserror 2.0.18", "thiserror 2.0.18",
"timsort", "timsort",
"uname", "uname",
"unic-ucd-bidi",
"unic-ucd-category",
"unic-ucd-ident",
"unicode-casing", "unicode-casing",
"wasm-bindgen", "wasm-bindgen",
"which", "which",
@@ -3621,7 +3701,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -3689,6 +3769,14 @@ dependencies = [
"keccak", "keccak",
] ]
[[package]]
name = "shared-build"
version = "0.2.0"
source = "git+https://github.com/arihant2math/tkinter.git?tag=v0.2.0#198fc35b1f18f4eda401f97a641908f321b1403a"
dependencies = [
"bindgen 0.71.1",
]
[[package]] [[package]]
name = "shlex" name = "shlex"
version = "1.3.0" version = "1.3.0"
@@ -3781,7 +3869,7 @@ dependencies = [
"heck", "heck",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -3790,17 +3878,6 @@ version = "2.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
[[package]]
name = "syn"
version = "1.0.109"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]] [[package]]
name = "syn" name = "syn"
version = "2.0.117" version = "2.0.117"
@@ -3820,7 +3897,7 @@ checksum = "b126de4ef6c2a628a68609dd00733766c3b015894698a438ebdf374933fc31d1"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -3831,7 +3908,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -3864,14 +3941,10 @@ checksum = "b1dd07eb858a2067e2f3c7155d54e929265c264e6f37efe3ee7a8d1b5a1dd0ba"
[[package]] [[package]]
name = "tcl-sys" name = "tcl-sys"
version = "0.2.0" version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "git+https://github.com/arihant2math/tkinter.git?tag=v0.2.0#198fc35b1f18f4eda401f97a641908f321b1403a"
checksum = "081cd46ee0f9c78ab8ab54953239f7a2202f3efe1743e726b7b177d64c766cc0"
dependencies = [ dependencies = [
"anyhow",
"bindgen 0.64.0",
"jobserver",
"libc",
"pkg-config", "pkg-config",
"shared-build",
] ]
[[package]] [[package]]
@@ -3928,7 +4001,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -3939,7 +4012,7 @@ checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -3990,6 +4063,16 @@ version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "639ce8ef6d2ba56be0383a94dd13b92138d58de44c62618303bb798fa92bdc00" checksum = "639ce8ef6d2ba56be0383a94dd13b92138d58de44c62618303bb798fa92bdc00"
[[package]]
name = "tinystr"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869"
dependencies = [
"displaydoc",
"zerovec",
]
[[package]] [[package]]
name = "tinytemplate" name = "tinytemplate"
version = "1.2.1" version = "1.2.1"
@@ -4018,14 +4101,10 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]] [[package]]
name = "tk-sys" name = "tk-sys"
version = "0.2.0" version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "git+https://github.com/arihant2math/tkinter.git?tag=v0.2.0#198fc35b1f18f4eda401f97a641908f321b1403a"
checksum = "963faa16744bacdab52b5a4ab049e264b020f20f73bdf0b5cfbfa8b9a8a8f8b7"
dependencies = [ dependencies = [
"bindgen 0.64.0",
"libc",
"pkg-config", "pkg-config",
"tcl-sys", "shared-build",
"x11",
] ]
[[package]] [[package]]
@@ -4046,7 +4125,7 @@ checksum = "2d2e76690929402faae40aebdda620a2c0e25dd6d3b9afe48867dfd95991f4bd"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -4136,15 +4215,6 @@ version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc" checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc"
[[package]]
name = "unic-normal"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f09d64d33589a94628bc2aeb037f35c2e25f3f049c7348b5aa5580b48e6bba62"
dependencies = [
"unic-ucd-normal",
]
[[package]] [[package]]
name = "unic-ucd-age" name = "unic-ucd-age"
version = "0.9.0" version = "0.9.0"
@@ -4156,61 +4226,6 @@ dependencies = [
"unic-ucd-version", "unic-ucd-version",
] ]
[[package]]
name = "unic-ucd-bidi"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1d568b51222484e1f8209ce48caa6b430bf352962b877d592c29ab31fb53d8c"
dependencies = [
"unic-char-property",
"unic-char-range",
"unic-ucd-version",
]
[[package]]
name = "unic-ucd-category"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b8d4591f5fcfe1bd4453baaf803c40e1b1e69ff8455c47620440b46efef91c0"
dependencies = [
"matches",
"unic-char-property",
"unic-char-range",
"unic-ucd-version",
]
[[package]]
name = "unic-ucd-hangul"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eb1dc690e19010e1523edb9713224cba5ef55b54894fe33424439ec9a40c0054"
dependencies = [
"unic-ucd-version",
]
[[package]]
name = "unic-ucd-ident"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e230a37c0381caa9219d67cf063aa3a375ffed5bf541a452db16e744bdab6987"
dependencies = [
"unic-char-property",
"unic-char-range",
"unic-ucd-version",
]
[[package]]
name = "unic-ucd-normal"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "86aed873b8202d22b13859dda5fe7c001d271412c31d411fd9b827e030569410"
dependencies = [
"unic-char-property",
"unic-char-range",
"unic-ucd-hangul",
"unic-ucd-version",
]
[[package]] [[package]]
name = "unic-ucd-version" name = "unic-ucd-version"
version = "0.9.0" version = "0.9.0"
@@ -4220,12 +4235,6 @@ dependencies = [
"unic-common", "unic-common",
] ]
[[package]]
name = "unicode-bidi-mirroring"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5dfa6e8c60bb66d49db113e0125ee8711b7647b5579dc7f5f19c42357ed039fe"
[[package]] [[package]]
name = "unicode-casing" name = "unicode-casing"
version = "0.1.1" version = "0.1.1"
@@ -4313,6 +4322,18 @@ version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
[[package]]
name = "utf16_iter"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
[[package]]
name = "utf8_iter"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
[[package]] [[package]]
name = "utf8parse" name = "utf8parse"
version = "0.2.2" version = "0.2.2"
@@ -4413,7 +4434,7 @@ dependencies = [
"bumpalo", "bumpalo",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
"wasm-bindgen-shared", "wasm-bindgen-shared",
] ]
@@ -4553,7 +4574,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -4564,7 +4585,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
] ]
[[package]] [[package]]
@@ -4845,14 +4866,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
[[package]] [[package]]
name = "x11" name = "write16"
version = "2.21.0" version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "502da5464ccd04011667b11c435cb992822c2c0dbde1770c988480d312a0db2e" checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
dependencies = [
"libc", [[package]]
"pkg-config", name = "writeable"
] version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
[[package]] [[package]]
name = "x509-cert" name = "x509-cert"
@@ -4891,6 +4914,29 @@ version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8aa498d22c9bbaf482329839bc5620c46be275a19a812e9a22a2b07529a642a" checksum = "b8aa498d22c9bbaf482329839bc5620c46be275a19a812e9a22a2b07529a642a"
[[package]]
name = "yoke"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954"
dependencies = [
"stable_deref_trait",
"yoke-derive",
"zerofrom",
]
[[package]]
name = "yoke-derive"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d"
dependencies = [
"proc-macro2",
"quote",
"syn",
"synstructure",
]
[[package]] [[package]]
name = "zerocopy" name = "zerocopy"
version = "0.8.34" version = "0.8.34"
@@ -4908,7 +4954,28 @@ checksum = "d8187381b52e32220d50b255276aa16a084ec0a9017a0ca2152a1f55c539758d"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
]
[[package]]
name = "zerofrom"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5"
dependencies = [
"zerofrom-derive",
]
[[package]]
name = "zerofrom-derive"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502"
dependencies = [
"proc-macro2",
"quote",
"syn",
"synstructure",
] ]
[[package]] [[package]]
@@ -4928,7 +4995,40 @@ checksum = "85a5b4158499876c763cb03bc4e49185d3cccbabb15b33c627f7884f43db852e"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn",
]
[[package]]
name = "zerotrie"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851"
dependencies = [
"displaydoc",
"yoke",
"zerofrom",
]
[[package]]
name = "zerovec"
version = "0.11.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002"
dependencies = [
"yoke",
"zerofrom",
"zerovec-derive",
]
[[package]]
name = "zerovec-derive"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3"
dependencies = [
"proc-macro2",
"quote",
"syn",
] ]
[[package]] [[package]]

View File

@@ -101,7 +101,7 @@ opt-level = 3
lto = "thin" lto = "thin"
[patch.crates-io] [patch.crates-io]
# parking_lot_core = { git = "https://github.com/youknowone/parking_lot", branch = "rustpython" } parking_lot_core = { git = "https://github.com/youknowone/parking_lot", branch = "rustpython" }
# REDOX START, Uncomment when you want to compile/check with redoxer # REDOX START, Uncomment when you want to compile/check with redoxer
# REDOX END # REDOX END
@@ -153,6 +153,7 @@ rustpython-vm = { path = "crates/vm", default-features = false, version = "0.5.0
rustpython-pylib = { path = "crates/pylib", version = "0.5.0" } rustpython-pylib = { path = "crates/pylib", version = "0.5.0" }
rustpython-stdlib = { path = "crates/stdlib", default-features = false, version = "0.5.0" } rustpython-stdlib = { path = "crates/stdlib", default-features = false, version = "0.5.0" }
rustpython-sre_engine = { path = "crates/sre_engine", version = "0.5.0" } rustpython-sre_engine = { path = "crates/sre_engine", version = "0.5.0" }
rustpython-unicode = { path = "crates/unicode", default-features = false, version = "0.5.0" }
rustpython-wtf8 = { path = "crates/wtf8", version = "0.5.0" } rustpython-wtf8 = { path = "crates/wtf8", version = "0.5.0" }
rustpython-doc = { path = "crates/doc", version = "0.5.0" } rustpython-doc = { path = "crates/doc", version = "0.5.0" }
@@ -222,15 +223,11 @@ strum = "0.28"
strum_macros = "0.28" strum_macros = "0.28"
syn = "2" syn = "2"
thiserror = "2.0" thiserror = "2.0"
icu_properties = "2"
icu_normalizer = "2"
unicode-casing = "0.1.1" unicode-casing = "0.1.1"
unic-char-property = "0.9.0"
unic-normal = "0.9.0"
unic-ucd-age = "0.9.0" unic-ucd-age = "0.9.0"
unic-ucd-bidi = "0.9.0"
unic-ucd-category = "0.9.0"
unic-ucd-ident = "0.9.0"
unicode_names2 = "2.0.0" unicode_names2 = "2.0.0"
unicode-bidi-mirroring = "0.4"
widestring = "1.2.0" widestring = "1.2.0"
windows-sys = "0.61.2" windows-sys = "0.61.2"
wasm-bindgen = "0.2.106" wasm-bindgen = "0.2.106"

13
Lib/test/test_file.py vendored
View File

@@ -126,7 +126,7 @@ class AutoFileTests:
# it must also return None if an exception was given # it must also return None if an exception was given
try: try:
1/0 1/0
except: except ZeroDivisionError:
self.assertEqual(self.f.__exit__(*sys.exc_info()), None) self.assertEqual(self.f.__exit__(*sys.exc_info()), None)
def testReadWhenWriting(self): def testReadWhenWriting(self):
@@ -216,6 +216,16 @@ class OtherFileTests:
with self.assertWarnsRegex(RuntimeWarning, 'line buffering'): with self.assertWarnsRegex(RuntimeWarning, 'line buffering'):
self._checkBufferSize(1) self._checkBufferSize(1)
def testDefaultBufferSize(self):
with self.open(TESTFN, 'wb') as f:
blksize = f.raw._blksize
f.write(b"\0" * 5_000_000)
with self.open(TESTFN, 'rb') as f:
data = f.read1()
expected_size = max(min(blksize, 8192 * 1024), io.DEFAULT_BUFFER_SIZE)
self.assertEqual(len(data), expected_size)
def testTruncateOnWindows(self): def testTruncateOnWindows(self):
# SF bug <https://bugs.python.org/issue801631> # SF bug <https://bugs.python.org/issue801631>
# "file.truncate fault on windows" # "file.truncate fault on windows"
@@ -344,7 +354,6 @@ class OtherFileTests:
class COtherFileTests(OtherFileTests, unittest.TestCase): class COtherFileTests(OtherFileTests, unittest.TestCase):
open = io.open open = io.open
class PyOtherFileTests(OtherFileTests, unittest.TestCase): class PyOtherFileTests(OtherFileTests, unittest.TestCase):
open = staticmethod(pyio.open) open = staticmethod(pyio.open)

View File

@@ -2,13 +2,12 @@
""" """
import os import os
import stat
import sys import sys
import unittest import unittest
import socket import socket
import shutil import shutil
import threading import threading
from test.support import requires, bigmemtest from test.support import requires, bigmemtest, requires_resource
from test.support import SHORT_TIMEOUT from test.support import SHORT_TIMEOUT
from test.support import socket_helper from test.support import socket_helper
from test.support.os_helper import TESTFN, unlink from test.support.os_helper import TESTFN, unlink
@@ -29,7 +28,7 @@ class LargeFileTest:
mode = 'w+b' mode = 'w+b'
with self.open(TESTFN, mode) as f: with self.open(TESTFN, mode) as f:
current_size = os.fstat(f.fileno())[stat.ST_SIZE] current_size = os.fstat(f.fileno()).st_size
if current_size == size+1: if current_size == size+1:
return return
@@ -40,13 +39,13 @@ class LargeFileTest:
f.seek(size) f.seek(size)
f.write(b'a') f.write(b'a')
f.flush() f.flush()
self.assertEqual(os.fstat(f.fileno())[stat.ST_SIZE], size+1) self.assertEqual(os.fstat(f.fileno()).st_size, size+1)
@classmethod @classmethod
def tearDownClass(cls): def tearDownClass(cls):
with cls.open(TESTFN, 'wb'): with cls.open(TESTFN, 'wb'):
pass pass
if not os.stat(TESTFN)[stat.ST_SIZE] == 0: if not os.stat(TESTFN).st_size == 0:
raise cls.failureException('File was not truncated by opening ' raise cls.failureException('File was not truncated by opening '
'with mode "wb"') 'with mode "wb"')
unlink(TESTFN2) unlink(TESTFN2)
@@ -67,7 +66,7 @@ class TestFileMethods(LargeFileTest):
self.assertEqual(f.tell(), size + 1) self.assertEqual(f.tell(), size + 1)
def test_osstat(self): def test_osstat(self):
self.assertEqual(os.stat(TESTFN)[stat.ST_SIZE], size+1) self.assertEqual(os.stat(TESTFN).st_size, size+1)
def test_seek_read(self): def test_seek_read(self):
with self.open(TESTFN, 'rb') as f: with self.open(TESTFN, 'rb') as f:
@@ -142,6 +141,9 @@ class TestFileMethods(LargeFileTest):
f.truncate(1) f.truncate(1)
self.assertEqual(f.tell(), 0) # else pointer moved self.assertEqual(f.tell(), 0) # else pointer moved
f.seek(0) f.seek(0)
# Verify readall on a truncated file is well behaved. read()
# without a size can be unbounded, this should get just the byte
# that remains.
self.assertEqual(len(f.read()), 1) # else wasn't truncated self.assertEqual(len(f.read()), 1) # else wasn't truncated
def test_seekable(self): def test_seekable(self):
@@ -152,6 +154,22 @@ class TestFileMethods(LargeFileTest):
f.seek(pos) f.seek(pos)
self.assertTrue(f.seekable()) self.assertTrue(f.seekable())
@bigmemtest(size=size, memuse=2, dry_run=False)
def test_seek_readall(self, _size):
# Seek which doesn't change position should readall successfully.
with self.open(TESTFN, 'rb') as f:
self.assertEqual(f.seek(0, os.SEEK_CUR), 0)
self.assertEqual(len(f.read()), size + 1)
# Seek which changes (or might change) position should readall
# successfully.
with self.open(TESTFN, 'rb') as f:
self.assertEqual(f.seek(20, os.SEEK_SET), 20)
self.assertEqual(len(f.read()), size - 19)
with self.open(TESTFN, 'rb') as f:
self.assertEqual(f.seek(-3, os.SEEK_END), size - 2)
self.assertEqual(len(f.read()), 3)
def skip_no_disk_space(path, required): def skip_no_disk_space(path, required):
def decorator(fun): def decorator(fun):
@@ -173,6 +191,7 @@ class TestCopyfile(LargeFileTest, unittest.TestCase):
# Exact required disk space would be (size * 2), but let's give it a # Exact required disk space would be (size * 2), but let's give it a
# bit more tolerance. # bit more tolerance.
@skip_no_disk_space(TESTFN, size * 2.5) @skip_no_disk_space(TESTFN, size * 2.5)
@requires_resource('cpu')
def test_it(self): def test_it(self):
# Internally shutil.copyfile() can use "fast copy" methods like # Internally shutil.copyfile() can use "fast copy" methods like
# os.sendfile(). # os.sendfile().
@@ -222,6 +241,7 @@ class TestSocketSendfile(LargeFileTest, unittest.TestCase):
# Exact required disk space would be (size * 2), but let's give it a # Exact required disk space would be (size * 2), but let's give it a
# bit more tolerance. # bit more tolerance.
@skip_no_disk_space(TESTFN, size * 2.5) @skip_no_disk_space(TESTFN, size * 2.5)
@requires_resource('cpu')
def test_it(self): def test_it(self):
port = socket_helper.find_unused_port() port = socket_helper.find_unused_port()
with socket.create_server(("", port)) as sock: with socket.create_server(("", port)) as sock:

View File

@@ -1,4 +1,5 @@
# Test the module type # Test the module type
import importlib.machinery
import unittest import unittest
import weakref import weakref
from test.support import gc_collect from test.support import gc_collect
@@ -29,7 +30,7 @@ class ModuleTests(unittest.TestCase):
self.fail("__name__ = %s" % repr(s)) self.fail("__name__ = %s" % repr(s))
except AttributeError: except AttributeError:
pass pass
self.assertEqual(foo.__doc__, ModuleType.__doc__) self.assertEqual(foo.__doc__, ModuleType.__doc__ or '')
def test_uninitialized_missing_getattr(self): def test_uninitialized_missing_getattr(self):
# Issue 8297 # Issue 8297
@@ -102,8 +103,7 @@ class ModuleTests(unittest.TestCase):
gc_collect() gc_collect()
self.assertEqual(f().__dict__["bar"], 4) self.assertEqual(f().__dict__["bar"], 4)
# TODO: RUSTPYTHON @unittest.expectedFailure # TODO: RUSTPYTHON
@unittest.expectedFailure
def test_clear_dict_in_ref_cycle(self): def test_clear_dict_in_ref_cycle(self):
destroyed = [] destroyed = []
m = ModuleType("foo") m = ModuleType("foo")
@@ -152,15 +152,13 @@ a = A(destroyed)"""
if 'test.test_module.bad_getattr2' in sys.modules: if 'test.test_module.bad_getattr2' in sys.modules:
del sys.modules['test.test_module.bad_getattr2'] del sys.modules['test.test_module.bad_getattr2']
# TODO: RUSTPYTHON @unittest.expectedFailure # TODO: RUSTPYTHON
@unittest.expectedFailure
def test_module_dir(self): def test_module_dir(self):
import test.test_module.good_getattr as gga import test.test_module.good_getattr as gga
self.assertEqual(dir(gga), ['a', 'b', 'c']) self.assertEqual(dir(gga), ['a', 'b', 'c'])
del sys.modules['test.test_module.good_getattr'] del sys.modules['test.test_module.good_getattr']
# TODO: RUSTPYTHON @unittest.expectedFailure # TODO: RUSTPYTHON
@unittest.expectedFailure
def test_module_dir_errors(self): def test_module_dir_errors(self):
import test.test_module.bad_getattr as bga import test.test_module.bad_getattr as bga
from test.test_module import bad_getattr2 from test.test_module import bad_getattr2
@@ -270,11 +268,38 @@ a = A(destroyed)"""
self.assertEqual(r[-len(ends_with):], ends_with, self.assertEqual(r[-len(ends_with):], ends_with,
'{!r} does not end with {!r}'.format(r, ends_with)) '{!r} does not end with {!r}'.format(r, ends_with))
# TODO: RUSTPYTHON def test_module_repr_with_namespace_package(self):
@unittest.expectedFailure m = ModuleType('foo')
loader = importlib.machinery.NamespaceLoader('foo', ['bar'], 'baz')
spec = importlib.machinery.ModuleSpec('foo', loader)
m.__loader__ = loader
m.__spec__ = spec
self.assertEqual(repr(m), "<module 'foo' (namespace) from ['bar']>")
def test_module_repr_with_namespace_package_and_custom_loader(self):
m = ModuleType('foo')
loader = BareLoader()
spec = importlib.machinery.ModuleSpec('foo', loader)
m.__loader__ = loader
m.__spec__ = spec
expected_repr_pattern = r"<module 'foo' \(<.*\.BareLoader object at .+>\)>"
self.assertRegex(repr(m), expected_repr_pattern)
self.assertNotIn('from', repr(m))
def test_module_repr_with_fake_namespace_package(self):
m = ModuleType('foo')
loader = BareLoader()
loader._path = ['spam']
spec = importlib.machinery.ModuleSpec('foo', loader)
m.__loader__ = loader
m.__spec__ = spec
expected_repr_pattern = r"<module 'foo' \(<.*\.BareLoader object at .+>\)>"
self.assertRegex(repr(m), expected_repr_pattern)
self.assertNotIn('from', repr(m))
def test_module_finalization_at_shutdown(self): def test_module_finalization_at_shutdown(self):
# Module globals and builtins should still be available during shutdown # Module globals and builtins should still be available during shutdown
rc, out, err = assert_python_ok("-c", "from test import final_a") rc, out, err = assert_python_ok("-c", "from test.test_module import final_a")
self.assertFalse(err) self.assertFalse(err)
lines = out.splitlines() lines = out.splitlines()
self.assertEqual(set(lines), { self.assertEqual(set(lines), {

19
Lib/test/test_module/final_a.py vendored Normal file
View File

@@ -0,0 +1,19 @@
"""
Fodder for module finalization tests in test_module.
"""
import shutil
import test.test_module.final_b
x = 'a'
class C:
def __del__(self):
# Inspect module globals and builtins
print("x =", x)
print("final_b.x =", test.test_module.final_b.x)
print("shutil.rmtree =", getattr(shutil.rmtree, '__name__', None))
print("len =", getattr(len, '__name__', None))
c = C()
_underscored = C()

19
Lib/test/test_module/final_b.py vendored Normal file
View File

@@ -0,0 +1,19 @@
"""
Fodder for module finalization tests in test_module.
"""
import shutil
import test.test_module.final_a
x = 'b'
class C:
def __del__(self):
# Inspect module globals and builtins
print("x =", x)
print("final_a.x =", test.test_module.final_a.x)
print("shutil.rmtree =", getattr(shutil.rmtree, '__name__', None))
print("len =", getattr(len, '__name__', None))
c = C()
_underscored = C()

File diff suppressed because it is too large Load Diff

90
Lib/test/test_optimizer.py vendored Normal file
View File

@@ -0,0 +1,90 @@
import unittest
import types
from test.support import import_helper
_testinternalcapi = import_helper.import_module("_testinternalcapi")
class TestRareEventCounters(unittest.TestCase):
def setUp(self):
_testinternalcapi.reset_rare_event_counters()
def test_set_class(self):
class A:
pass
class B:
pass
a = A()
orig_counter = _testinternalcapi.get_rare_event_counters()["set_class"]
a.__class__ = B
self.assertEqual(
orig_counter + 1,
_testinternalcapi.get_rare_event_counters()["set_class"]
)
def test_set_bases(self):
class A:
pass
class B:
pass
class C(B):
pass
orig_counter = _testinternalcapi.get_rare_event_counters()["set_bases"]
C.__bases__ = (A,)
self.assertEqual(
orig_counter + 1,
_testinternalcapi.get_rare_event_counters()["set_bases"]
)
def test_set_eval_frame_func(self):
orig_counter = _testinternalcapi.get_rare_event_counters()["set_eval_frame_func"]
_testinternalcapi.set_eval_frame_record([])
self.assertEqual(
orig_counter + 1,
_testinternalcapi.get_rare_event_counters()["set_eval_frame_func"]
)
_testinternalcapi.set_eval_frame_default()
def test_builtin_dict(self):
orig_counter = _testinternalcapi.get_rare_event_counters()["builtin_dict"]
if isinstance(__builtins__, types.ModuleType):
builtins = __builtins__.__dict__
else:
builtins = __builtins__
builtins["FOO"] = 42
self.assertEqual(
orig_counter + 1,
_testinternalcapi.get_rare_event_counters()["builtin_dict"]
)
del builtins["FOO"]
def test_func_modification(self):
def func(x=0):
pass
for attribute in (
"__code__",
"__defaults__",
"__kwdefaults__"
):
orig_counter = _testinternalcapi.get_rare_event_counters()["func_modification"]
setattr(func, attribute, getattr(func, attribute))
self.assertEqual(
orig_counter + 1,
_testinternalcapi.get_rare_event_counters()["func_modification"]
)
class TestOptimizerSymbols(unittest.TestCase):
@unittest.skipUnless(hasattr(_testinternalcapi, "uop_symbols_test"),
"requires _testinternalcapi.uop_symbols_test")
def test_optimizer_symbols(self):
_testinternalcapi.uop_symbols_test()
if __name__ == "__main__":
unittest.main()

23
Lib/test/test_perfmaps.py vendored Normal file
View File

@@ -0,0 +1,23 @@
import os
import sys
import unittest
try:
from _testinternalcapi import perf_map_state_teardown, write_perf_map_entry
except ImportError:
raise unittest.SkipTest("requires _testinternalcapi")
if sys.platform != 'linux':
raise unittest.SkipTest('Linux only')
class TestPerfMapWriting(unittest.TestCase):
def test_write_perf_map_entry(self):
self.assertEqual(write_perf_map_entry(0x1234, 5678, "entry1"), 0)
self.assertEqual(write_perf_map_entry(0x2345, 6789, "entry2"), 0)
with open(f"/tmp/perf-{os.getpid()}.map") as f:
perf_file_contents = f.read()
self.assertIn("1234 162e entry1", perf_file_contents)
self.assertIn("2345 1a85 entry2", perf_file_contents)
perf_map_state_teardown()

View File

@@ -854,6 +854,7 @@ class StrTest(string_tests.StringLikeTest,
self.assertTrue('\U0001F46F'.isprintable()) self.assertTrue('\U0001F46F'.isprintable())
self.assertFalse('\U000E0020'.isprintable()) self.assertFalse('\U000E0020'.isprintable())
@unittest.expectedFailure # TODO: RUSTPYTHON
@support.requires_resource('cpu') @support.requires_resource('cpu')
def test_isprintable_invariant(self): def test_isprintable_invariant(self):
for codepoint in range(sys.maxunicode + 1): for codepoint in range(sys.maxunicode + 1):

View File

@@ -232,7 +232,6 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
b = 'C\u0338' * 20 + '\xC7' b = 'C\u0338' * 20 + '\xC7'
self.assertEqual(self.db.normalize('NFC', a), b) self.assertEqual(self.db.normalize('NFC', a), b)
@unittest.expectedFailure # TODO: RUSTPYTHON; ? +
def test_issue29456(self): def test_issue29456(self):
# Fix #29456 # Fix #29456
u1176_str_a = '\u1100\u1176\u11a8' u1176_str_a = '\u1100\u1176\u11a8'
@@ -389,6 +388,7 @@ class NormalizationTest(unittest.TestCase):
data = [int(x, 16) for x in data.split(" ")] data = [int(x, 16) for x in data.split(" ")]
return "".join([chr(x) for x in data]) return "".join([chr(x) for x in data])
@unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: False is not true : 13055
@requires_resource('network') @requires_resource('network')
@requires_resource('cpu') @requires_resource('cpu')
def test_normalization(self): def test_normalization(self):

View File

@@ -14,6 +14,7 @@ std = ["thiserror/std", "itertools/use_std"]
[dependencies] [dependencies]
rustpython-compiler-core = { workspace = true } rustpython-compiler-core = { workspace = true }
rustpython-unicode = { workspace = true, default-features = false }
rustpython-literal = {workspace = true } rustpython-literal = {workspace = true }
rustpython-wtf8 = { workspace = true } rustpython-wtf8 = { workspace = true }
ruff_python_ast = { workspace = true } ruff_python_ast = { workspace = true }
@@ -29,7 +30,6 @@ num-traits = { workspace = true }
thiserror = { workspace = true } thiserror = { workspace = true }
malachite-bigint = { workspace = true } malachite-bigint = { workspace = true }
memchr = { workspace = true } memchr = { workspace = true }
unicode_names2 = { workspace = true }
[dev-dependencies] [dev-dependencies]
ruff_python_parser = { workspace = true } ruff_python_parser = { workspace = true }

View File

@@ -113,7 +113,9 @@ impl StringParser {
let name_and_ending = self.skip_bytes(close_idx + 1); let name_and_ending = self.skip_bytes(close_idx + 1);
let name = &name_and_ending[..name_and_ending.len() - 1]; let name = &name_and_ending[..name_and_ending.len() - 1];
unicode_names2::character(name).ok_or_else(|| unreachable!()) rustpython_unicode::data::lookup(name)
.and_then(char::from_u32)
.ok_or_else(|| unreachable!())
} }
/// Parse an escaped character, returning the new character. /// Parse an escaped character, returning the new character.

View File

@@ -16,6 +16,7 @@ wasm_js = ["getrandom/wasm_js"]
[dependencies] [dependencies]
rustpython-literal = { workspace = true } rustpython-literal = { workspace = true }
rustpython-unicode = { workspace = true, default-features = false }
rustpython-wtf8 = { workspace = true } rustpython-wtf8 = { workspace = true }
ascii = { workspace = true } ascii = { workspace = true }
@@ -29,7 +30,6 @@ malachite-q = { workspace = true }
malachite-base = { workspace = true } malachite-base = { workspace = true }
num-traits = { workspace = true } num-traits = { workspace = true }
parking_lot = { workspace = true, optional = true } parking_lot = { workspace = true, optional = true }
unicode_names2 = { workspace = true }
radium = { workspace = true } radium = { workspace = true }
lock_api = "0.4" lock_api = "0.4"

View File

@@ -414,7 +414,7 @@ pub mod errors {
let mut out = String::with_capacity(num_chars * 4); let mut out = String::with_capacity(num_chars * 4);
for c in err_str.code_points() { for c in err_str.code_points() {
let c_u32 = c.to_u32(); let c_u32 = c.to_u32();
if let Some(c_name) = c.to_char().and_then(unicode_names2::name) { if let Some(c_name) = rustpython_unicode::data::name(c_u32) {
write!(out, "\\N{{{c_name}}}").unwrap(); write!(out, "\\N{{{c_name}}}").unwrap();
} else if c_u32 >= 0x10000 { } else if c_u32 >= 0x10000 {
write!(out, "\\U{c_u32:08x}").unwrap(); write!(out, "\\U{c_u32:08x}").unwrap();

View File

@@ -9,13 +9,13 @@ license = { workspace = true }
rust-version = { workspace = true } rust-version = { workspace = true }
[dependencies] [dependencies]
rustpython-unicode = { workspace = true, default-features = false }
rustpython-wtf8 = { workspace = true } rustpython-wtf8 = { workspace = true }
hexf-parse = "0.2.1" hexf-parse = "0.2.1"
is-macro.workspace = true is-macro.workspace = true
lexical-parse-float = { version = "1.0.6", features = ["format"] } lexical-parse-float = { version = "1.0.6", features = ["format"] }
num-traits = { workspace = true } num-traits = { workspace = true }
unic-ucd-category = { workspace = true }
[dev-dependencies] [dev-dependencies]
rand = { workspace = true } rand = { workspace = true }

View File

@@ -1,15 +0,0 @@
use unic_ucd_category::GeneralCategory;
/// According to python following categories aren't printable:
/// * Cc (Other, Control)
/// * Cf (Other, Format)
/// * Cs (Other, Surrogate)
/// * Co (Other, Private Use)
/// * Cn (Other, Not Assigned)
/// * Zl Separator, Line ('\u2028', LINE SEPARATOR)
/// * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
/// * Zs (Separator, Space) other than ASCII space('\x20').
pub fn is_printable(c: char) -> bool {
let cat = GeneralCategory::of(c);
!(cat.is_other() || cat.is_separator())
}

View File

@@ -204,7 +204,7 @@ impl UnicodeEscape<'_> {
'\\' | '\t' | '\r' | '\n' => 2, '\\' | '\t' | '\r' | '\n' => 2,
ch if ch < ' ' || ch as u32 == 0x7f => 4, // \xHH ch if ch < ' ' || ch as u32 == 0x7f => 4, // \xHH
ch if ch.is_ascii() => 1, ch if ch.is_ascii() => 1,
ch if crate::char::is_printable(ch) => { ch if rustpython_unicode::classify::is_repr_printable(ch as u32) => {
// max = std::cmp::max(ch, max); // max = std::cmp::max(ch, max);
ch.len_utf8() ch.len_utf8()
} }
@@ -238,7 +238,9 @@ impl UnicodeEscape<'_> {
ch if ch.is_ascii() => { ch if ch.is_ascii() => {
write!(formatter, "\\x{:02x}", ch as u8) write!(formatter, "\\x{:02x}", ch as u8)
} }
ch if crate::char::is_printable(ch) => formatter.write_char(ch), ch if rustpython_unicode::classify::is_repr_printable(ch as u32) => {
formatter.write_char(ch)
}
'\0'..='\u{ff}' => { '\0'..='\u{ff}' => {
write!(formatter, "\\x{:02x}", ch as u32) write!(formatter, "\\x{:02x}", ch as u32)
} }

View File

@@ -2,7 +2,6 @@
extern crate alloc; extern crate alloc;
pub mod char;
pub mod complex; pub mod complex;
pub mod escape; pub mod escape;
pub mod float; pub mod float;

View File

@@ -15,6 +15,7 @@ name = "benches"
harness = false harness = false
[dependencies] [dependencies]
rustpython-unicode = { workspace = true, default-features = false }
rustpython-wtf8 = { workspace = true } rustpython-wtf8 = { workspace = true }
num_enum = { workspace = true } num_enum = { workspace = true }
bitflags = { workspace = true } bitflags = { workspace = true }

View File

@@ -1,14 +1,10 @@
// good luck to those that follow; here be dragons // good luck to those that follow; here be dragons
use crate::string::{
is_digit, is_linebreak, is_loc_word, is_space, is_uni_digit, is_uni_linebreak, is_uni_space,
is_uni_word, is_word, lower_ascii, lower_locate, lower_unicode, upper_locate, upper_unicode,
};
use super::{MAXREPEAT, SreAtCode, SreCatCode, SreInfo, SreOpcode, StrDrive, StringCursor}; use super::{MAXREPEAT, SreAtCode, SreCatCode, SreInfo, SreOpcode, StrDrive, StringCursor};
use alloc::{vec, vec::Vec}; use alloc::{vec, vec::Vec};
use core::{convert::TryFrom, ptr::null}; use core::{convert::TryFrom, ptr::null};
use optional::Optioned; use optional::Optioned;
use rustpython_unicode::regex as unicode_regex;
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
pub struct Request<'a, S> { pub struct Request<'a, S> {
@@ -659,10 +655,10 @@ fn _match<S: StrDrive>(req: &Request<'_, S>, state: &mut State, mut ctx: MatchCo
} }
SreOpcode::IN => general_op_in!(charset), SreOpcode::IN => general_op_in!(charset),
SreOpcode::IN_IGNORE => { SreOpcode::IN_IGNORE => {
general_op_in!(|set, c| charset(set, lower_ascii(c))) general_op_in!(|set, c| charset(set, unicode_regex::lower_ascii(c)))
} }
SreOpcode::IN_UNI_IGNORE => { SreOpcode::IN_UNI_IGNORE => {
general_op_in!(|set, c| charset(set, lower_unicode(c))) general_op_in!(|set, c| charset(set, unicode_regex::lower_unicode(c)))
} }
SreOpcode::IN_LOC_IGNORE => general_op_in!(charset_loc_ignore), SreOpcode::IN_LOC_IGNORE => general_op_in!(charset_loc_ignore),
SreOpcode::MARK => { SreOpcode::MARK => {
@@ -803,25 +799,31 @@ fn _match<S: StrDrive>(req: &Request<'_, S>, state: &mut State, mut ctx: MatchCo
SreOpcode::LITERAL => general_op_literal!(|code, c| code == c), SreOpcode::LITERAL => general_op_literal!(|code, c| code == c),
SreOpcode::NOT_LITERAL => general_op_literal!(|code, c| code != c), SreOpcode::NOT_LITERAL => general_op_literal!(|code, c| code != c),
SreOpcode::LITERAL_IGNORE => { SreOpcode::LITERAL_IGNORE => {
general_op_literal!(|code, c| code == lower_ascii(c)) general_op_literal!(|code, c| code == unicode_regex::lower_ascii(c))
} }
SreOpcode::NOT_LITERAL_IGNORE => { SreOpcode::NOT_LITERAL_IGNORE => {
general_op_literal!(|code, c| code != lower_ascii(c)) general_op_literal!(|code, c| code != unicode_regex::lower_ascii(c))
} }
SreOpcode::LITERAL_UNI_IGNORE => { SreOpcode::LITERAL_UNI_IGNORE => {
general_op_literal!(|code, c| code == lower_unicode(c)) general_op_literal!(|code, c| code == unicode_regex::lower_unicode(c))
} }
SreOpcode::NOT_LITERAL_UNI_IGNORE => { SreOpcode::NOT_LITERAL_UNI_IGNORE => {
general_op_literal!(|code, c| code != lower_unicode(c)) general_op_literal!(|code, c| code != unicode_regex::lower_unicode(c))
} }
SreOpcode::LITERAL_LOC_IGNORE => general_op_literal!(char_loc_ignore), SreOpcode::LITERAL_LOC_IGNORE => general_op_literal!(char_loc_ignore),
SreOpcode::NOT_LITERAL_LOC_IGNORE => { SreOpcode::NOT_LITERAL_LOC_IGNORE => {
general_op_literal!(|code, c| !char_loc_ignore(code, c)) general_op_literal!(|code, c| !char_loc_ignore(code, c))
} }
SreOpcode::GROUPREF => general_op_groupref!(|x| x), SreOpcode::GROUPREF => general_op_groupref!(|x| x),
SreOpcode::GROUPREF_IGNORE => general_op_groupref!(lower_ascii), SreOpcode::GROUPREF_IGNORE => {
SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref!(lower_locate), general_op_groupref!(unicode_regex::lower_ascii)
SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref!(lower_unicode), }
SreOpcode::GROUPREF_LOC_IGNORE => {
general_op_groupref!(unicode_regex::lower_locale)
}
SreOpcode::GROUPREF_UNI_IGNORE => {
general_op_groupref!(unicode_regex::lower_unicode)
}
SreOpcode::GROUPREF_EXISTS => { SreOpcode::GROUPREF_EXISTS => {
let (group_start, group_end) = let (group_start, group_end) =
state.marks.get(ctx.peek_code(req, 1) as usize); state.marks.get(ctx.peek_code(req, 1) as usize);
@@ -1125,7 +1127,7 @@ impl MatchContext {
} }
fn at_linebreak<S: StrDrive>(&self, req: &Request<'_, S>) -> bool { fn at_linebreak<S: StrDrive>(&self, req: &Request<'_, S>) -> bool {
!self.at_end(req) && is_linebreak(self.peek_char::<S>()) !self.at_end(req) && unicode_regex::is_linebreak(self.peek_char::<S>())
} }
fn at_boundary<S: StrDrive, F: FnMut(u32) -> bool>( fn at_boundary<S: StrDrive, F: FnMut(u32) -> bool>(
@@ -1192,54 +1194,56 @@ impl MatchContext {
fn at<S: StrDrive>(req: &Request<'_, S>, ctx: &MatchContext, at_code: SreAtCode) -> bool { fn at<S: StrDrive>(req: &Request<'_, S>, ctx: &MatchContext, at_code: SreAtCode) -> bool {
match at_code { match at_code {
SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => ctx.at_beginning(), SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => ctx.at_beginning(),
SreAtCode::BEGINNING_LINE => ctx.at_beginning() || is_linebreak(ctx.back_peek_char::<S>()), SreAtCode::BEGINNING_LINE => {
SreAtCode::BOUNDARY => ctx.at_boundary(req, is_word), ctx.at_beginning() || unicode_regex::is_linebreak(ctx.back_peek_char::<S>())
SreAtCode::NON_BOUNDARY => ctx.at_non_boundary(req, is_word), }
SreAtCode::BOUNDARY => ctx.at_boundary(req, unicode_regex::is_word),
SreAtCode::NON_BOUNDARY => ctx.at_non_boundary(req, unicode_regex::is_word),
SreAtCode::END => { SreAtCode::END => {
(ctx.remaining_chars(req) == 1 && ctx.at_linebreak(req)) || ctx.at_end(req) (ctx.remaining_chars(req) == 1 && ctx.at_linebreak(req)) || ctx.at_end(req)
} }
SreAtCode::END_LINE => ctx.at_linebreak(req) || ctx.at_end(req), SreAtCode::END_LINE => ctx.at_linebreak(req) || ctx.at_end(req),
SreAtCode::END_STRING => ctx.at_end(req), SreAtCode::END_STRING => ctx.at_end(req),
SreAtCode::LOC_BOUNDARY => ctx.at_boundary(req, is_loc_word), SreAtCode::LOC_BOUNDARY => ctx.at_boundary(req, unicode_regex::is_locale_word),
SreAtCode::LOC_NON_BOUNDARY => ctx.at_non_boundary(req, is_loc_word), SreAtCode::LOC_NON_BOUNDARY => ctx.at_non_boundary(req, unicode_regex::is_locale_word),
SreAtCode::UNI_BOUNDARY => ctx.at_boundary(req, is_uni_word), SreAtCode::UNI_BOUNDARY => ctx.at_boundary(req, unicode_regex::is_unicode_word),
SreAtCode::UNI_NON_BOUNDARY => ctx.at_non_boundary(req, is_uni_word), SreAtCode::UNI_NON_BOUNDARY => ctx.at_non_boundary(req, unicode_regex::is_unicode_word),
} }
} }
fn char_loc_ignore(code: u32, c: u32) -> bool { fn char_loc_ignore(code: u32, c: u32) -> bool {
code == c || code == lower_locate(c) || code == upper_locate(c) code == c || code == unicode_regex::lower_locale(c) || code == unicode_regex::upper_locale(c)
} }
fn charset_loc_ignore(set: &[u32], c: u32) -> bool { fn charset_loc_ignore(set: &[u32], c: u32) -> bool {
let lo = lower_locate(c); let lo = unicode_regex::lower_locale(c);
if charset(set, c) { if charset(set, c) {
return true; return true;
} }
let up = upper_locate(c); let up = unicode_regex::upper_locale(c);
up != lo && charset(set, up) up != lo && charset(set, up)
} }
fn category(cat_code: SreCatCode, c: u32) -> bool { fn category(cat_code: SreCatCode, c: u32) -> bool {
match cat_code { match cat_code {
SreCatCode::DIGIT => is_digit(c), SreCatCode::DIGIT => unicode_regex::is_digit(c),
SreCatCode::NOT_DIGIT => !is_digit(c), SreCatCode::NOT_DIGIT => !unicode_regex::is_digit(c),
SreCatCode::SPACE => is_space(c), SreCatCode::SPACE => unicode_regex::is_space(c),
SreCatCode::NOT_SPACE => !is_space(c), SreCatCode::NOT_SPACE => !unicode_regex::is_space(c),
SreCatCode::WORD => is_word(c), SreCatCode::WORD => unicode_regex::is_word(c),
SreCatCode::NOT_WORD => !is_word(c), SreCatCode::NOT_WORD => !unicode_regex::is_word(c),
SreCatCode::LINEBREAK => is_linebreak(c), SreCatCode::LINEBREAK => unicode_regex::is_linebreak(c),
SreCatCode::NOT_LINEBREAK => !is_linebreak(c), SreCatCode::NOT_LINEBREAK => !unicode_regex::is_linebreak(c),
SreCatCode::LOC_WORD => is_loc_word(c), SreCatCode::LOC_WORD => unicode_regex::is_locale_word(c),
SreCatCode::LOC_NOT_WORD => !is_loc_word(c), SreCatCode::LOC_NOT_WORD => !unicode_regex::is_locale_word(c),
SreCatCode::UNI_DIGIT => is_uni_digit(c), SreCatCode::UNI_DIGIT => unicode_regex::is_unicode_digit(c),
SreCatCode::UNI_NOT_DIGIT => !is_uni_digit(c), SreCatCode::UNI_NOT_DIGIT => !unicode_regex::is_unicode_digit(c),
SreCatCode::UNI_SPACE => is_uni_space(c), SreCatCode::UNI_SPACE => unicode_regex::is_unicode_space(c),
SreCatCode::UNI_NOT_SPACE => !is_uni_space(c), SreCatCode::UNI_NOT_SPACE => !unicode_regex::is_unicode_space(c),
SreCatCode::UNI_WORD => is_uni_word(c), SreCatCode::UNI_WORD => unicode_regex::is_unicode_word(c),
SreCatCode::UNI_NOT_WORD => !is_uni_word(c), SreCatCode::UNI_NOT_WORD => !unicode_regex::is_unicode_word(c),
SreCatCode::UNI_LINEBREAK => is_uni_linebreak(c), SreCatCode::UNI_LINEBREAK => unicode_regex::is_unicode_linebreak(c),
SreCatCode::UNI_NOT_LINEBREAK => !is_uni_linebreak(c), SreCatCode::UNI_NOT_LINEBREAK => !unicode_regex::is_unicode_linebreak(c),
} }
} }
@@ -1320,7 +1324,7 @@ fn charset(set: &[u32], ch: u32) -> bool {
if set[i + 1] <= ch && ch <= set[i + 2] { if set[i + 1] <= ch && ch <= set[i + 2] {
return ok; return ok;
} }
let ch = upper_unicode(ch); let ch = unicode_regex::upper_unicode(ch);
if set[i + 1] <= ch && ch <= set[i + 2] { if set[i + 1] <= ch && ch <= set[i + 2] {
return ok; return ok;
} }
@@ -1368,10 +1372,14 @@ fn _count<S: StrDrive>(
general_count_literal(req, ctx, end, |code, c| code != c); general_count_literal(req, ctx, end, |code, c| code != c);
} }
SreOpcode::LITERAL_IGNORE => { SreOpcode::LITERAL_IGNORE => {
general_count_literal(req, ctx, end, |code, c| code == lower_ascii(c)); general_count_literal(req, ctx, end, |code, c| {
code == unicode_regex::lower_ascii(c)
});
} }
SreOpcode::NOT_LITERAL_IGNORE => { SreOpcode::NOT_LITERAL_IGNORE => {
general_count_literal(req, ctx, end, |code, c| code != lower_ascii(c)); general_count_literal(req, ctx, end, |code, c| {
code != unicode_regex::lower_ascii(c)
});
} }
SreOpcode::LITERAL_LOC_IGNORE => { SreOpcode::LITERAL_LOC_IGNORE => {
general_count_literal(req, ctx, end, char_loc_ignore); general_count_literal(req, ctx, end, char_loc_ignore);
@@ -1380,10 +1388,14 @@ fn _count<S: StrDrive>(
general_count_literal(req, ctx, end, |code, c| !char_loc_ignore(code, c)); general_count_literal(req, ctx, end, |code, c| !char_loc_ignore(code, c));
} }
SreOpcode::LITERAL_UNI_IGNORE => { SreOpcode::LITERAL_UNI_IGNORE => {
general_count_literal(req, ctx, end, |code, c| code == lower_unicode(c)); general_count_literal(req, ctx, end, |code, c| {
code == unicode_regex::lower_unicode(c)
});
} }
SreOpcode::NOT_LITERAL_UNI_IGNORE => { SreOpcode::NOT_LITERAL_UNI_IGNORE => {
general_count_literal(req, ctx, end, |code, c| code != lower_unicode(c)); general_count_literal(req, ctx, end, |code, c| {
code != unicode_regex::lower_unicode(c)
});
} }
_ => { _ => {
/* General case */ /* General case */

View File

@@ -331,136 +331,3 @@ const fn utf8_is_cont_byte(byte: u8) -> bool {
/// Mask of the value bits of a continuation byte. /// Mask of the value bits of a continuation byte.
const CONT_MASK: u8 = 0b0011_1111; const CONT_MASK: u8 = 0b0011_1111;
/// Reports whether `b` is one of the six ASCII whitespace bytes accepted here:
/// tab, line feed, vertical tab, form feed, carriage return, or space.
const fn is_py_ascii_whitespace(b: u8) -> bool {
    // 0x09..=0x0D is the contiguous run \t, \n, \x0B, \x0C, \r.
    matches!(b, 0x09..=0x0D | 0x20)
}
/// ASCII word test: true for `_` and for ASCII letters and digits.
///
/// Any value that does not fit in a `u8` is never a word character.
#[inline]
pub(crate) fn is_word(ch: u32) -> bool {
    match u8::try_from(ch) {
        Ok(byte) => byte == b'_' || byte.is_ascii_alphanumeric(),
        Err(_) => false,
    }
}
#[inline]
pub(crate) fn is_space(ch: u32) -> bool {
u8::try_from(ch)
.map(is_py_ascii_whitespace)
.unwrap_or(false)
}
/// ASCII digit test: true only for the code points of `0` through `9`.
#[inline]
pub(crate) fn is_digit(ch: u32) -> bool {
    matches!(u8::try_from(ch), Ok(byte) if byte.is_ascii_digit())
}
/// "Locale" alphanumeric test; currently identical to the ASCII test because
/// the locale is not consulted.
#[inline]
pub(crate) fn is_loc_alnum(ch: u32) -> bool {
    // FIXME: Ignore the locales
    u8::try_from(ch).is_ok_and(|byte| byte.is_ascii_alphanumeric())
}
/// "Locale" word test: an underscore or anything `is_loc_alnum` accepts.
#[inline]
pub(crate) fn is_loc_word(ch: u32) -> bool {
    is_loc_alnum(ch) || ch == u32::from(b'_')
}
/// Line-break test for the non-Unicode categories: only line feed counts.
#[inline]
pub(crate) const fn is_linebreak(ch: u32) -> bool {
    // 0x0A is '\n'.
    matches!(ch, 0x0A)
}
/// Lowercases `ch` using ASCII rules only.
///
/// Values that do not fit in a byte, and bytes outside `A`-`Z`, are returned
/// unchanged.
#[inline]
pub fn lower_ascii(ch: u32) -> u32 {
    match u8::try_from(ch) {
        Ok(byte) => u32::from(byte.to_ascii_lowercase()),
        Err(_) => ch,
    }
}
/// "Locale" lowercase mapping of `ch`.
///
/// Currently a plain forward to `lower_ascii`; the locale is not consulted.
#[inline]
pub(crate) fn lower_locate(ch: u32) -> u32 {
// FIXME: Ignore the locales
lower_ascii(ch)
}
/// "Locale" uppercase mapping of `ch`; currently ASCII-only.
///
/// Values that do not fit in a byte, and bytes outside `a`-`z`, are returned
/// unchanged.
#[inline]
pub(crate) fn upper_locate(ch: u32) -> u32 {
    // FIXME: Ignore the locales
    match u8::try_from(ch) {
        Ok(byte) => u32::from(byte.to_ascii_uppercase()),
        Err(_) => ch,
    }
}
/// Digit test used for the Unicode categories.
///
/// As written it accepts only ASCII `0`-`9`; values that are not Unicode
/// scalar values are rejected.
#[inline]
pub(crate) fn is_uni_digit(ch: u32) -> bool {
    // TODO: check with cpython
    char::from_u32(ch).is_some_and(|c| c.is_ascii_digit())
}
/// Whitespace test used for the Unicode categories.
///
/// Accepts a fixed table of 29 code points. The ASCII whitespace bytes
/// (U+0009..=U+000D and U+0020) are part of that table, so no separate ASCII
/// check is needed before consulting it.
#[inline]
pub(crate) fn is_uni_space(ch: u32) -> bool {
    // TODO: check with cpython
    matches!(
        ch,
        // tab, line feed, vertical tab, form feed, carriage return
        0x0009..=0x000D
            // U+001C..=U+001F plus the ASCII space
            | 0x001C..=0x0020
            | 0x0085
            | 0x00A0
            | 0x1680
            | 0x2000..=0x200A
            | 0x2028
            | 0x2029
            | 0x202F
            | 0x205F
            | 0x3000
    )
}
/// Line-break test used for the Unicode categories: a fixed table of ten code
/// points.
#[inline]
pub(crate) const fn is_uni_linebreak(ch: u32) -> bool {
    matches!(
        ch,
        0x000A..=0x000D | 0x001C..=0x001E | 0x0085 | 0x2028 | 0x2029
    )
}
/// Unicode alphanumeric test, delegating to `char::is_alphanumeric`.
///
/// Values that are not Unicode scalar values (surrogates, out-of-range) are
/// never alphanumeric.
#[inline]
pub(crate) fn is_uni_alnum(ch: u32) -> bool {
    // TODO: check with cpython
    matches!(char::from_u32(ch), Some(c) if c.is_alphanumeric())
}
/// Unicode word test: an underscore or anything `is_uni_alnum` accepts.
#[inline]
pub(crate) fn is_uni_word(ch: u32) -> bool {
    is_uni_alnum(ch) || ch == u32::from(b'_')
}
/// Simple Unicode lowercase mapping of a single code point.
///
/// Only the first scalar of `char::to_lowercase` is kept, so multi-character
/// lowerings are truncated. Values that are not Unicode scalar values are
/// returned unchanged.
#[inline]
pub fn lower_unicode(ch: u32) -> u32 {
    // TODO: check with cpython
    match char::from_u32(ch) {
        // `to_lowercase` always yields at least one char.
        Some(c) => u32::from(c.to_lowercase().next().unwrap()),
        None => ch,
    }
}
/// Simple Unicode uppercase mapping of a single code point.
///
/// Only the first scalar of `char::to_uppercase` is kept, so multi-character
/// upperings (for example U+00DF) are truncated. Values that are not Unicode
/// scalar values are returned unchanged.
#[inline]
pub fn upper_unicode(ch: u32) -> u32 {
    // TODO: check with cpython
    match char::from_u32(ch) {
        // `to_uppercase` always yields at least one char.
        Some(c) => u32::from(c.to_uppercase().next().unwrap()),
        None => ch,
    }
}

View File

@@ -28,6 +28,7 @@ flame-it = ["flame"]
[dependencies] [dependencies]
# rustpython crates # rustpython crates
rustpython-derive = { workspace = true } rustpython-derive = { workspace = true }
rustpython-unicode = { workspace = true, features = ["casefold"] }
rustpython-vm = { workspace = true, default-features = false, features = ["compiler"]} rustpython-vm = { workspace = true, default-features = false, features = ["compiler"]}
rustpython-common = { workspace = true } rustpython-common = { workspace = true }
@@ -76,16 +77,6 @@ pbkdf2 = { version = "0.12", features = ["hmac"] }
constant_time_eq = { workspace = true } constant_time_eq = { workspace = true }
## unicode stuff ## unicode stuff
unicode_names2 = { workspace = true }
# update version all at the same time
unic-char-property = { workspace = true }
unic-normal = { workspace = true }
unic-ucd-bidi = { workspace = true }
unic-ucd-category = { workspace = true }
unic-ucd-age = { workspace = true }
ucd = "0.1.1"
unicode-bidi-mirroring = { workspace = true }
# compression # compression
adler32 = "1.2.0" adler32 = "1.2.0"
crc32fast = "1.3.2" crc32fast = "1.3.2"
@@ -94,8 +85,8 @@ libz-sys = { package = "libz-rs-sys", version = "0.5" }
bzip2 = "0.6" bzip2 = "0.6"
# tkinter # tkinter
tk-sys = { version = "0.2.0", optional = true } tk-sys = { git = "https://github.com/arihant2math/tkinter.git", tag = "v0.2.0", optional = true }
tcl-sys = { version = "0.2.0", optional = true } tcl-sys = { git = "https://github.com/arihant2math/tkinter.git", tag = "v0.2.0", optional = true }
widestring = { workspace = true, optional = true } widestring = { workspace = true, optional = true }
chrono.workspace = true chrono.workspace = true

View File

@@ -6,55 +6,30 @@
pub(crate) use unicodedata::module_def; pub(crate) use unicodedata::module_def;
use crate::vm::{
PyObject, PyResult, VirtualMachine, builtins::PyStr, convert::TryFromBorrowedObject,
};
enum NormalizeForm {
Nfc,
Nfkc,
Nfd,
Nfkd,
}
impl<'a> TryFromBorrowedObject<'a> for NormalizeForm {
fn try_from_borrowed_object(vm: &VirtualMachine, obj: &'a PyObject) -> PyResult<Self> {
obj.try_value_with(
|form: &PyStr| match form.as_bytes() {
b"NFC" => Ok(Self::Nfc),
b"NFKC" => Ok(Self::Nfkc),
b"NFD" => Ok(Self::Nfd),
b"NFKD" => Ok(Self::Nfkd),
_ => Err(vm.new_value_error("invalid normalization form")),
},
vm,
)
}
}
#[pymodule] #[pymodule]
mod unicodedata { mod unicodedata {
use super::NormalizeForm::*;
use crate::vm::{ use crate::vm::{
Py, PyObjectRef, PyPayload, PyRef, PyResult, VirtualMachine, Py, PyObjectRef, PyPayload, PyRef, PyResult, VirtualMachine,
builtins::{PyModule, PyStrRef}, builtins::{PyModule, PyStrRef},
function::OptionalArg, function::OptionalArg,
}; };
use itertools::Itertools; use itertools::Itertools;
use rustpython_common::wtf8::{CodePoint, Wtf8Buf}; use rustpython_common::wtf8::{CodePoint, Wtf8Buf};
use ucd::{Codepoint, DecompositionType, EastAsianWidth, Number, NumericType}; use rustpython_unicode::{NormalizeForm, UNICODE_VERSION, UnicodeVersion, data};
use unic_char_property::EnumeratedCharProperty;
use unic_normal::StrNormalForm; fn parse_normalize_form(form: PyStrRef, vm: &VirtualMachine) -> PyResult<NormalizeForm> {
use unic_ucd_age::{Age, UNICODE_VERSION, UnicodeVersion}; form.to_str()
use unic_ucd_bidi::BidiClass; .ok_or_else(|| vm.new_value_error("invalid normalization form"))?
use unic_ucd_category::GeneralCategory; .parse()
use unicode_bidi_mirroring::is_mirroring; .map_err(|()| vm.new_value_error("invalid normalization form"))
}
pub(crate) fn module_exec(vm: &VirtualMachine, module: &Py<PyModule>) -> PyResult<()> { pub(crate) fn module_exec(vm: &VirtualMachine, module: &Py<PyModule>) -> PyResult<()> {
__module_exec(vm, module); __module_exec(vm, module);
// Add UCD methods as module-level functions // Add UCD methods as module-level functions
let ucd: PyObjectRef = Ucd::new(UNICODE_VERSION).into_ref(&vm.ctx).into(); let ucd: PyObjectRef = PyUcd::new(data::Ucd::default()).into_ref(&vm.ctx).into();
for attr in [ for attr in [
"category", "category",
@@ -80,56 +55,40 @@ mod unicodedata {
#[pyattr] #[pyattr]
#[pyclass(name = "UCD")] #[pyclass(name = "UCD")]
#[derive(Debug, PyPayload)] #[derive(Debug, PyPayload)]
pub(super) struct Ucd { pub(super) struct PyUcd(data::Ucd);
unic_version: UnicodeVersion,
}
impl Ucd { impl PyUcd {
pub const fn new(unic_version: UnicodeVersion) -> Self { pub const fn new(ucd: data::Ucd) -> Self {
Self { unic_version } Self(ucd)
} }
fn check_age(&self, c: CodePoint) -> bool { fn extract_char(character: PyStrRef, vm: &VirtualMachine) -> PyResult<CodePoint> {
c.to_char() character
.is_none_or(|c| Age::of(c).is_some_and(|age| age.actual() <= self.unic_version))
}
fn extract_char(
&self,
character: PyStrRef,
vm: &VirtualMachine,
) -> PyResult<Option<CodePoint>> {
let c = character
.as_wtf8() .as_wtf8()
.code_points() .code_points()
.exactly_one() .exactly_one()
.map_err(|_| vm.new_type_error("argument must be an unicode character, not str"))?; .map_err(|_| vm.new_type_error("argument must be a Unicode character, not str"))
Ok(self.check_age(c).then_some(c))
} }
} }
#[pyclass(flags(DISALLOW_INSTANTIATION))] #[pyclass(flags(DISALLOW_INSTANTIATION))]
impl Ucd { impl PyUcd {
#[pymethod] #[pymethod]
fn category(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<String> { fn category(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
Ok(self Ok(self
.extract_char(character, vm)? .0
.map_or(GeneralCategory::Unassigned, |c| { .category(Self::extract_char(character, vm)?.to_u32())
c.to_char()
.map_or(GeneralCategory::Surrogate, GeneralCategory::of)
})
.abbr_name()
.to_owned()) .to_owned())
} }
#[pymethod] #[pymethod]
fn lookup(&self, name: PyStrRef, vm: &VirtualMachine) -> PyResult<String> { fn lookup(&self, name: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
if let Some(name_str) = name.to_str() if let Some(name_str) = name.to_str()
&& let Some(character) = unicode_names2::character(name_str) && let Some(character) = self.0.lookup(name_str)
&& self.check_age(character.into())
{ {
return Ok(character.to_string()); return Ok(char::from_u32(character)
.expect("unicode_names2 only returns Unicode scalar values")
.to_string());
} }
Err(vm.new_key_error( Err(vm.new_key_error(
vm.ctx vm.ctx
@@ -145,13 +104,8 @@ mod unicodedata {
default: OptionalArg<PyObjectRef>, default: OptionalArg<PyObjectRef>,
vm: &VirtualMachine, vm: &VirtualMachine,
) -> PyResult { ) -> PyResult {
let c = self.extract_char(character, vm)?; if let Some(name) = self.0.name(Self::extract_char(character, vm)?.to_u32()) {
return Ok(vm.ctx.new_str(name).into());
if let Some(c) = c
&& self.check_age(c)
&& let Some(name) = c.to_char().and_then(unicode_names2::name)
{
return Ok(vm.ctx.new_str(name.to_string()).into());
} }
default.ok_or_else(|| vm.new_value_error("no such name")) default.ok_or_else(|| vm.new_value_error("no such name"))
} }
@@ -162,14 +116,9 @@ mod unicodedata {
character: PyStrRef, character: PyStrRef,
vm: &VirtualMachine, vm: &VirtualMachine,
) -> PyResult<&'static str> { ) -> PyResult<&'static str> {
let bidi = match self.extract_char(character, vm)? { Ok(self
Some(c) => c .0
.to_char() .bidirectional(Self::extract_char(character, vm)?.to_u32()))
.map_or(BidiClass::LeftToRight, BidiClass::of)
.abbr_name(),
None => "",
};
Ok(bidi)
} }
/// NOTE: This function uses 9.0.0 database instead of 3.2.0 /// NOTE: This function uses 9.0.0 database instead of 3.2.0
@@ -180,76 +129,51 @@ mod unicodedata {
vm: &VirtualMachine, vm: &VirtualMachine,
) -> PyResult<&'static str> { ) -> PyResult<&'static str> {
Ok(self Ok(self
.extract_char(character, vm)? .0
.and_then(|c| c.to_char()) .east_asian_width(Self::extract_char(character, vm)?.to_u32()))
.map_or(EastAsianWidth::Neutral, |c| c.east_asian_width())
.abbr_name())
} }
#[pymethod] #[pymethod]
fn normalize(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult<Wtf8Buf> { fn normalize(
let text = unistr.as_wtf8(); &self,
let normalized_text = match form { form: PyStrRef,
Nfc => text.map_utf8(|s| s.nfc()).collect(), unistr: PyStrRef,
Nfkc => text.map_utf8(|s| s.nfkc()).collect(), vm: &VirtualMachine,
Nfd => text.map_utf8(|s| s.nfd()).collect(), ) -> PyResult<Wtf8Buf> {
Nfkd => text.map_utf8(|s| s.nfkd()).collect(), Ok(self
}; .0
Ok(normalized_text) .normalize(parse_normalize_form(form, vm)?, unistr.as_wtf8()))
} }
#[pymethod] #[pymethod]
fn is_normalized(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult<bool> { fn is_normalized(
let text = unistr.as_wtf8(); &self,
let normalized: Wtf8Buf = match form { form: PyStrRef,
Nfc => text.map_utf8(|s| s.nfc()).collect(), unistr: PyStrRef,
Nfkc => text.map_utf8(|s| s.nfkc()).collect(), vm: &VirtualMachine,
Nfd => text.map_utf8(|s| s.nfd()).collect(), ) -> PyResult<bool> {
Nfkd => text.map_utf8(|s| s.nfkd()).collect(), Ok(self
}; .0
Ok(text == &*normalized) .is_normalized(parse_normalize_form(form, vm)?, unistr.as_wtf8()))
} }
#[pymethod] #[pymethod]
fn mirrored(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<i32> { fn mirrored(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<i32> {
match self.extract_char(character, vm)? { Ok(self.0.mirrored(Self::extract_char(character, vm)?.to_u32()) as i32)
Some(c) => {
if let Some(ch) = c.to_char() {
// Check if the character is mirrored in bidirectional text using Unicode standard
Ok(if is_mirroring(ch) { 1 } else { 0 })
} else {
Ok(0)
}
}
None => Ok(0),
}
} }
#[pymethod] #[pymethod]
fn combining(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<i32> { fn combining(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<u8> {
Ok(self Ok(self
.extract_char(character, vm)? .0
.and_then(|c| c.to_char()) .combining(Self::extract_char(character, vm)?.to_u32()))
.map_or(0, |ch| ch.canonical_combining_class() as i32))
} }
#[pymethod] #[pymethod]
fn decomposition(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<String> { fn decomposition(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
let ch = match self.extract_char(character, vm)?.and_then(|c| c.to_char()) { Ok(self
Some(ch) => ch, .0
None => return Ok(String::new()), .decomposition(Self::extract_char(character, vm)?.to_u32()))
};
let chars: Vec<char> = ch.decomposition_map().collect();
// If decomposition maps to just the character itself, there's no decomposition
if chars.len() == 1 && chars[0] == ch {
return Ok(String::new());
}
let hex_parts = chars.iter().map(|c| format!("{:04X}", *c as u32)).join(" ");
let tag = match ch.decomposition_type() {
Some(DecompositionType::Canonical) | None => return Ok(hex_parts),
Some(dt) => decomposition_type_tag(dt),
};
Ok(format!("<{tag}> {hex_parts}"))
} }
#[pymethod] #[pymethod]
@@ -259,15 +183,8 @@ mod unicodedata {
default: OptionalArg<PyObjectRef>, default: OptionalArg<PyObjectRef>,
vm: &VirtualMachine, vm: &VirtualMachine,
) -> PyResult { ) -> PyResult {
let ch = self.extract_char(character, vm)?.and_then(|c| c.to_char()); if let Some(value) = self.0.digit(Self::extract_char(character, vm)?.to_u32()) {
if let Some(ch) = ch return Ok(vm.ctx.new_int(value).into());
&& matches!(
ch.numeric_type(),
Some(NumericType::Decimal) | Some(NumericType::Digit)
)
&& let Some(Number::Integer(n)) = ch.numeric_value()
{
return Ok(vm.ctx.new_int(n).into());
} }
default.ok_or_else(|| vm.new_value_error("not a digit")) default.ok_or_else(|| vm.new_value_error("not a digit"))
} }
@@ -279,12 +196,8 @@ mod unicodedata {
default: OptionalArg<PyObjectRef>, default: OptionalArg<PyObjectRef>,
vm: &VirtualMachine, vm: &VirtualMachine,
) -> PyResult { ) -> PyResult {
let ch = self.extract_char(character, vm)?.and_then(|c| c.to_char()); if let Some(value) = self.0.decimal(Self::extract_char(character, vm)?.to_u32()) {
if let Some(ch) = ch return Ok(vm.ctx.new_int(value).into());
&& ch.numeric_type() == Some(NumericType::Decimal)
&& let Some(Number::Integer(n)) = ch.numeric_value()
{
return Ok(vm.ctx.new_int(n).into());
} }
default.ok_or_else(|| vm.new_value_error("not a decimal")) default.ok_or_else(|| vm.new_value_error("not a decimal"))
} }
@@ -296,75 +209,29 @@ mod unicodedata {
default: OptionalArg<PyObjectRef>, default: OptionalArg<PyObjectRef>,
vm: &VirtualMachine, vm: &VirtualMachine,
) -> PyResult { ) -> PyResult {
let ch = self.extract_char(character, vm)?.and_then(|c| c.to_char()); if let Some(value) = self.0.numeric(Self::extract_char(character, vm)?.to_u32()) {
if let Some(ch) = ch { let value = match value {
match ch.numeric_value() { data::NumericValue::Integer(n) => n as f64,
Some(Number::Integer(n)) => { data::NumericValue::Rational(num, den) => num as f64 / den as f64,
return Ok(vm.ctx.new_float(n as f64).into()); };
} return Ok(vm.ctx.new_float(value).into());
Some(Number::Rational(num, den)) => {
return Ok(vm.ctx.new_float(num as f64 / den as f64).into());
}
None => {}
}
} }
default.ok_or_else(|| vm.new_value_error("not a numeric character")) default.ok_or_else(|| vm.new_value_error("not a numeric character"))
} }
#[pygetset] #[pygetset]
fn unidata_version(&self) -> String { fn unidata_version(&self) -> String {
self.unic_version.to_string() self.0.unicode_version().to_string()
}
}
fn decomposition_type_tag(dt: DecompositionType) -> &'static str {
match dt {
DecompositionType::Canonical => "canonical",
DecompositionType::Compat => "compat",
DecompositionType::Circle => "circle",
DecompositionType::Final => "final",
DecompositionType::Font => "font",
DecompositionType::Fraction => "fraction",
DecompositionType::Initial => "initial",
DecompositionType::Isolated => "isolated",
DecompositionType::Medial => "medial",
DecompositionType::Narrow => "narrow",
DecompositionType::Nobreak => "noBreak",
DecompositionType::Small => "small",
DecompositionType::Square => "square",
DecompositionType::Sub => "sub",
DecompositionType::Super => "super",
DecompositionType::Vertical => "vertical",
DecompositionType::Wide => "wide",
}
}
trait EastAsianWidthAbbrName {
fn abbr_name(&self) -> &'static str;
}
impl EastAsianWidthAbbrName for EastAsianWidth {
fn abbr_name(&self) -> &'static str {
match self {
Self::Narrow => "Na",
Self::Wide => "W",
Self::Neutral => "N",
Self::Ambiguous => "A",
Self::FullWidth => "F",
Self::HalfWidth => "H",
}
} }
} }
#[pyattr] #[pyattr]
fn ucd_3_2_0(vm: &VirtualMachine) -> PyRef<Ucd> { fn ucd_3_2_0(vm: &VirtualMachine) -> PyRef<PyUcd> {
Ucd { PyUcd::new(data::Ucd::new(UnicodeVersion {
unic_version: UnicodeVersion { major: 3,
major: 3, minor: 2,
minor: 2, micro: 0,
micro: 0, }))
},
}
.into_ref(&vm.ctx) .into_ref(&vm.ctx)
} }

29
crates/unicode/Cargo.toml Normal file
View File

@@ -0,0 +1,29 @@
[package]
name = "rustpython-unicode"
description = "Shared Unicode semantics and data for RustPython and related Python tooling."
version.workspace = true
authors.workspace = true
edition.workspace = true
rust-version.workspace = true
repository.workspace = true
license.workspace = true
[features]
default = ["std", "casefold"]
std = []
casefold = ["std", "dep:caseless"]
[dependencies]
rustpython-wtf8 = { workspace = true }
icu_normalizer = { workspace = true }
icu_properties = { workspace = true }
itertools = { workspace = true }
unicode-casing = { workspace = true }
unicode_names2 = { version = "2.0.0", default-features = false, features = ["no_std"] }
unic-ucd-age = { workspace = true }
ucd = "0.1.1"
caseless = { version = "0.2.2", optional = true }
[lints]
workspace = true

111
crates/unicode/src/case.rs Normal file
View File

@@ -0,0 +1,111 @@
#[cfg(feature = "casefold")]
use alloc::string::String;
#[cfg(feature = "casefold")]
use rustpython_wtf8::Wtf8Chunk;
use rustpython_wtf8::{Wtf8, Wtf8Buf};
use unicode_casing::CharExt;
use crate::char_from_codepoint;
/// Result of case-mapping one code point: up to three code points stored
/// inline, so no allocation is needed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CaseMapping {
    /// Number of leading entries of `codepoints` that are meaningful.
    len: u8,
    /// Mapped code points; entries at index `len` and beyond are padding.
    codepoints: [u32; 3],
}

impl CaseMapping {
    /// Builds the one-element mapping that maps `cp` to itself.
    pub const fn identity(cp: u32) -> Self {
        Self {
            codepoints: [cp, 0, 0],
            len: 1,
        }
    }

    /// Returns the first mapped code point, or `None` for an empty mapping.
    pub const fn first(self) -> Option<u32> {
        match self.len {
            0 => None,
            _ => Some(self.codepoints[0]),
        }
    }

    /// Iterates over the mapped code points in order, leaving out the padding.
    pub fn iter(self) -> impl Iterator<Item = u32> {
        let used = usize::from(self.len);
        IntoIterator::into_iter(self.codepoints).take(used)
    }
}
/// Packs the leading chars of `chars` into a [`CaseMapping`].
///
/// At most three chars are stored; anything further is not consumed.
fn mapping_from_chars(chars: impl Iterator<Item = char>) -> CaseMapping {
    let mut codepoints = [0u32; 3];
    let mut len = 0u8;
    // Zipping against the slots caps the number of chars pulled at the array size.
    for (slot, ch) in codepoints.iter_mut().zip(chars) {
        *slot = u32::from(ch);
        len += 1;
    }
    CaseMapping { len, codepoints }
}
/// Packs the leading chars of an owned string into a `CaseMapping` by
/// forwarding `text.chars()` to `mapping_from_chars`.
///
/// Only compiled with the `casefold` feature.
#[cfg(feature = "casefold")]
fn mapping_from_string(text: String) -> CaseMapping {
mapping_from_chars(text.chars())
}
/// Lowercase mapping of a single code point via `char::to_lowercase`.
///
/// Code points that `char_from_codepoint` rejects map to themselves.
pub fn to_lowercase(cp: u32) -> CaseMapping {
    match char_from_codepoint(cp) {
        Some(ch) => mapping_from_chars(ch.to_lowercase()),
        None => CaseMapping::identity(cp),
    }
}
/// Uppercase mapping of a single code point via `char::to_uppercase`.
///
/// Code points that `char_from_codepoint` rejects map to themselves.
pub fn to_uppercase(cp: u32) -> CaseMapping {
    match char_from_codepoint(cp) {
        Some(ch) => mapping_from_chars(ch.to_uppercase()),
        None => CaseMapping::identity(cp),
    }
}
/// Titlecase mapping of a single code point via the `to_titlecase` method
/// that `unicode_casing::CharExt` adds to `char`.
///
/// Code points that `char_from_codepoint` rejects map to themselves.
pub fn to_titlecase(cp: u32) -> CaseMapping {
    match char_from_codepoint(cp) {
        Some(ch) => mapping_from_chars(ch.to_titlecase()),
        None => CaseMapping::identity(cp),
    }
}
/// Lowercases WTF-8 text by applying `char::to_lowercase` to every `char` of
/// the string slices that `Wtf8::map_utf8` hands to the closure.
// NOTE(review): presumably `map_utf8` passes surrogate parts through
// unchanged — confirm against `rustpython_wtf8`.
pub fn to_lowercase_wtf8(text: &Wtf8) -> Wtf8Buf {
text.map_utf8(|s| s.chars().flat_map(char::to_lowercase))
.collect()
}
/// Uppercases WTF-8 text by applying `char::to_uppercase` to every `char` of
/// the string slices that `Wtf8::map_utf8` hands to the closure.
// NOTE(review): presumably `map_utf8` passes surrogate parts through
// unchanged — confirm against `rustpython_wtf8`.
pub fn to_uppercase_wtf8(text: &Wtf8) -> Wtf8Buf {
text.map_utf8(|s| s.chars().flat_map(char::to_uppercase))
.collect()
}
/// Case-folds a single code point with `caseless::default_case_fold_str`.
///
/// Code points that `char_from_codepoint` rejects map to themselves. Only
/// compiled with the `casefold` feature.
#[cfg(feature = "casefold")]
pub fn casefold(cp: u32) -> CaseMapping {
    let Some(ch) = char_from_codepoint(cp) else {
        return CaseMapping::identity(cp);
    };
    // A `char` encodes to at most four UTF-8 bytes.
    let mut utf8 = [0u8; 4];
    let folded = caseless::default_case_fold_str(ch.encode_utf8(&mut utf8));
    mapping_from_string(folded)
}
/// Case-folds a whole string by forwarding to
/// `caseless::default_case_fold_str`.
///
/// Only compiled with the `casefold` feature.
#[cfg(feature = "casefold")]
pub fn casefold_str(text: &str) -> String {
caseless::default_case_fold_str(text)
}
/// Case-folds WTF-8 text chunk by chunk.
///
/// UTF-8 chunks go through `casefold_str`; surrogate chunks are converted to
/// a buffer as they are, without folding. The per-chunk buffers are then
/// collected into one `Wtf8Buf`. Only compiled with the `casefold` feature.
#[cfg(feature = "casefold")]
pub fn casefold_wtf8(text: &Wtf8) -> Wtf8Buf {
text.chunks()
.map(|chunk| match chunk {
Wtf8Chunk::Utf8(s) => Wtf8Buf::from_string(casefold_str(s)),
Wtf8Chunk::Surrogate(c) => Wtf8Buf::from(c),
})
.collect()
}

View File

@@ -0,0 +1,67 @@
use icu_properties::props::{BidiClass, EnumeratedProperty, GeneralCategory};
use ucd::{Codepoint, NumericType};
use crate::{char_from_codepoint, is_surrogate};
/// General category of `cp`.
///
/// Scalar values are looked up with `GeneralCategory::for_char`. Surrogates
/// report `Surrogate`, and every other value that is not a scalar value
/// reports `Unassigned`.
pub fn general_category(cp: u32) -> GeneralCategory {
    match char_from_codepoint(cp) {
        Some(ch) => GeneralCategory::for_char(ch),
        // Surrogates never convert to `char`, so they are told apart here.
        None if is_surrogate(cp) => GeneralCategory::Surrogate,
        None => GeneralCategory::Unassigned,
    }
}
/// True when `cp` is a Unicode scalar value for which `char::is_alphabetic`
/// holds; values `char_from_codepoint` rejects are never alphabetic.
pub fn is_alpha(cp: u32) -> bool {
char_from_codepoint(cp).is_some_and(char::is_alphabetic)
}
/// True when `cp` is a Unicode scalar value for which `char::is_alphanumeric`
/// holds; values `char_from_codepoint` rejects are never alphanumeric.
pub fn is_alnum(cp: u32) -> bool {
char_from_codepoint(cp).is_some_and(char::is_alphanumeric)
}
/// True when the general category of `cp` is `DecimalNumber`.
pub fn is_decimal(cp: u32) -> bool {
matches!(general_category(cp), GeneralCategory::DecimalNumber)
}
/// True when the numeric type of `cp`, as reported by the `ucd` crate, is
/// `Decimal` or `Digit`. Values `char_from_codepoint` rejects are never digits.
pub fn is_digit(cp: u32) -> bool {
    let Some(ch) = char_from_codepoint(cp) else {
        return false;
    };
    matches!(
        ch.numeric_type(),
        Some(NumericType::Decimal | NumericType::Digit)
    )
}
/// True when the `ucd` crate reports any numeric value for `cp`; values
/// `char_from_codepoint` rejects are never numeric.
pub fn is_numeric(cp: u32) -> bool {
char_from_codepoint(cp).is_some_and(|ch| ch.numeric_value().is_some())
}
/// Whitespace test: the general category is `SpaceSeparator`, or the bidi
/// class is `WhiteSpace`, `ParagraphSeparator` or `SegmentSeparator`.
///
/// Values `char_from_codepoint` rejects are never whitespace.
pub fn is_space(cp: u32) -> bool {
    let Some(ch) = char_from_codepoint(cp) else {
        return false;
    };
    if matches!(general_category(cp), GeneralCategory::SpaceSeparator) {
        return true;
    }
    matches!(
        BidiClass::for_char(ch),
        BidiClass::WhiteSpace | BidiClass::ParagraphSeparator | BidiClass::SegmentSeparator
    )
}
/// Python's `str.isprintable()` semantics, which treat ASCII space as printable.
///
/// Returns true for U+0020 and for every code point that
/// `is_repr_printable` accepts.
pub fn is_printable(cp: u32) -> bool {
cp == '\u{0020}' as u32 || is_repr_printable(cp)
}
/// Repr/escape printable semantics, which exclude all Unicode space separators.
///
/// A code point is printable unless its general category is one of the
/// separator categories (space, line, paragraph) or one of control, format,
/// surrogate, private use, or unassigned.
pub fn is_repr_printable(cp: u32) -> bool {
    match general_category(cp) {
        GeneralCategory::SpaceSeparator
        | GeneralCategory::LineSeparator
        | GeneralCategory::ParagraphSeparator
        | GeneralCategory::Control
        | GeneralCategory::Format
        | GeneralCategory::Surrogate
        | GeneralCategory::PrivateUse
        | GeneralCategory::Unassigned => false,
        _ => true,
    }
}

230
crates/unicode/src/data.rs Normal file
View File

@@ -0,0 +1,230 @@
use alloc::{format, string::String, vec::Vec};
use icu_properties::{
CodePointSetData,
props::{
BidiClass, BidiMirrored, CanonicalCombiningClass, EastAsianWidth, EnumeratedProperty,
NamedEnumeratedProperty,
},
};
use itertools::Itertools;
use ucd::{Codepoint, DecompositionType, Number, NumericType};
use unic_ucd_age::{Age, UNICODE_VERSION, UnicodeVersion};
use crate::{char_from_codepoint, classify, is_surrogate};
/// Numeric value attached to a code point, as returned by `numeric`.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum NumericValue {
/// A whole-number value.
Integer(i64),
/// A fraction, stored as (numerator, denominator).
Rational(i64, i64),
}
/// Unicode character database view pinned to a single Unicode version.
///
/// Per-code-point queries consult [`is_assigned_in_version`] first and return
/// a fixed fallback for code points it rejects.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Ucd {
    /// Version that gates every per-code-point query.
    unic_version: UnicodeVersion,
}

impl Default for Ucd {
    /// Builds a database pinned to `UNICODE_VERSION`.
    fn default() -> Self {
        Self {
            unic_version: UNICODE_VERSION,
        }
    }
}

impl Ucd {
    /// Builds a database pinned to `unic_version`.
    pub const fn new(unic_version: UnicodeVersion) -> Self {
        Self { unic_version }
    }

    /// Returns the Unicode version this database is pinned to.
    pub const fn unicode_version(&self) -> UnicodeVersion {
        self.unic_version
    }

    /// General category short name of `cp`; `"Cn"` when `cp` is not covered.
    pub fn category(&self, cp: u32) -> &'static str {
        if !self.contains(cp) {
            return "Cn";
        }
        category(cp)
    }

    /// Resolves a character name to its code point, or `None` when the name
    /// is unknown or the code point is not covered.
    pub fn lookup(&self, name: &str) -> Option<u32> {
        lookup(name).filter(|&cp| self.contains(cp))
    }

    /// Character name of `cp`, or `None` when it has none or is not covered.
    pub fn name(&self, cp: u32) -> Option<String> {
        if self.contains(cp) { name(cp) } else { None }
    }

    /// Bidirectional class short name of `cp`; empty when `cp` is not covered.
    pub fn bidirectional(&self, cp: u32) -> &'static str {
        if !self.contains(cp) {
            return "";
        }
        bidirectional(cp)
    }

    /// East Asian width short name of `cp`; `"N"` when `cp` is not covered.
    pub fn east_asian_width(&self, cp: u32) -> &'static str {
        if !self.contains(cp) {
            return "N";
        }
        east_asian_width(cp)
    }

    /// Normalizes `text` to `form`; the pinned version is not consulted.
    pub fn normalize(
        &self,
        form: crate::NormalizeForm,
        text: &rustpython_wtf8::Wtf8,
    ) -> rustpython_wtf8::Wtf8Buf {
        crate::normalize::normalize(form, text)
    }

    /// Reports whether `text` is already in `form`; the pinned version is not
    /// consulted.
    pub fn is_normalized(&self, form: crate::NormalizeForm, text: &rustpython_wtf8::Wtf8) -> bool {
        crate::normalize::is_normalized(form, text)
    }

    /// Whether `cp` is covered and has the bidi-mirrored property.
    pub fn mirrored(&self, cp: u32) -> bool {
        if !self.contains(cp) {
            return false;
        }
        mirrored(cp)
    }

    /// Canonical combining class of `cp`; `0` when `cp` is not covered.
    pub fn combining(&self, cp: u32) -> u8 {
        match self.contains(cp) {
            true => combining(cp),
            false => 0,
        }
    }

    /// Decomposition mapping text of `cp`; empty when `cp` is not covered.
    pub fn decomposition(&self, cp: u32) -> String {
        if !self.contains(cp) {
            return String::new();
        }
        decomposition(cp)
    }

    /// Digit value of `cp`, or `None` when it has none or is not covered.
    pub fn digit(&self, cp: u32) -> Option<u32> {
        if self.contains(cp) { digit(cp) } else { None }
    }

    /// Decimal value of `cp`, or `None` when it has none or is not covered.
    pub fn decimal(&self, cp: u32) -> Option<u32> {
        if self.contains(cp) { decimal(cp) } else { None }
    }

    /// Numeric value of `cp`, or `None` when it has none or is not covered.
    pub fn numeric(&self, cp: u32) -> Option<NumericValue> {
        if self.contains(cp) { numeric(cp) } else { None }
    }

    /// Whether `cp` counts as assigned in the pinned Unicode version.
    fn contains(&self, cp: u32) -> bool {
        is_assigned_in_version(cp, self.unic_version)
    }
}
/// Reports whether `cp` counts as assigned in Unicode `version`.
///
/// Surrogates always count. Other code points count when `Age::of` knows them
/// and their age is not newer than `version`. Values that are neither
/// surrogates nor scalar values never count.
pub fn is_assigned_in_version(cp: u32, version: UnicodeVersion) -> bool {
    if is_surrogate(cp) {
        return true;
    }
    let Some(ch) = char_from_codepoint(cp) else {
        return false;
    };
    match Age::of(ch) {
        Some(age) => age.actual() <= version,
        None => false,
    }
}
/// Short name of the general category that `classify::general_category`
/// reports for `cp`, with no Unicode-version gating.
pub fn category(cp: u32) -> &'static str {
classify::general_category(cp).short_name()
}
/// Resolves a character name to its code point via
/// `unicode_names2::character`, with no Unicode-version gating.
pub fn lookup(name: &str) -> Option<u32> {
unicode_names2::character(name).map(u32::from)
}
/// Character name of `cp` via `unicode_names2::name`, collected into a
/// `String`, with no Unicode-version gating.
///
/// Returns `None` when `char_from_codepoint` rejects `cp` or no name is found.
pub fn name(cp: u32) -> Option<String> {
char_from_codepoint(cp)
.and_then(unicode_names2::name)
.map(|name| name.collect())
}
/// Short name of the bidirectional class of `cp`, with no Unicode-version
/// gating.
///
/// Values `char_from_codepoint` rejects are reported as `LeftToRight`.
pub fn bidirectional(cp: u32) -> &'static str {
    let class = match char_from_codepoint(cp) {
        Some(ch) => BidiClass::for_char(ch),
        None => BidiClass::LeftToRight,
    };
    class.short_name()
}
/// Short name of the East Asian width of `cp`, with no Unicode-version gating.
///
/// Values `char_from_codepoint` rejects are reported as `Neutral`.
pub fn east_asian_width(cp: u32) -> &'static str {
    let width = match char_from_codepoint(cp) {
        Some(ch) => EastAsianWidth::for_char(ch),
        None => EastAsianWidth::Neutral,
    };
    width.short_name()
}
/// Whether `cp` is in the `BidiMirrored` code point set, with no
/// Unicode-version gating. Values `char_from_codepoint` rejects are never
/// mirrored.
pub fn mirrored(cp: u32) -> bool {
    match char_from_codepoint(cp) {
        Some(ch) => CodePointSetData::new::<BidiMirrored>().contains(ch),
        None => false,
    }
}
/// Canonical combining class of `cp` as a raw `u8`, with no Unicode-version
/// gating. Values `char_from_codepoint` rejects report `0`.
pub fn combining(cp: u32) -> u8 {
    match char_from_codepoint(cp) {
        Some(ch) => CanonicalCombiningClass::for_char(ch).to_icu4c_value(),
        None => 0,
    }
}
/// Decomposition mapping of `cp` as text, with no Unicode-version gating.
///
/// The result is the mapped code points as space-separated, zero-padded
/// upper-case hex. It is prefixed with `<tag> ` when the decomposition type
/// is anything other than canonical. It is empty when `cp` is rejected by
/// `char_from_codepoint` or decomposes only to itself.
pub fn decomposition(cp: u32) -> String {
    let Some(ch) = char_from_codepoint(cp) else {
        return String::new();
    };

    let mapped: Vec<char> = ch.decomposition_map().collect();
    // A mapping to just the character itself means "no decomposition".
    if mapped == [ch] {
        return String::new();
    }

    let hex_parts = mapped
        .iter()
        .map(|&c| format!("{:04X}", u32::from(c)))
        .join(" ");
    match ch.decomposition_type() {
        None | Some(DecompositionType::Canonical) => hex_parts,
        Some(tagged) => format!("<{}> {hex_parts}", decomposition_type_tag(tagged)),
    }
}
/// Digit value of `cp`, with no Unicode-version gating.
///
/// Returns a value only when the numeric type is `Decimal` or `Digit`, the
/// numeric value is an integer, and that integer fits in a `u32`.
pub fn digit(cp: u32) -> Option<u32> {
    let ch = char_from_codepoint(cp)?;
    match (ch.numeric_type(), ch.numeric_value()) {
        (
            Some(NumericType::Decimal | NumericType::Digit),
            Some(Number::Integer(value)),
        ) => u32::try_from(value).ok(),
        _ => None,
    }
}
/// Decimal value of `cp`, with no Unicode-version gating.
///
/// Returns a value only when the numeric type is `Decimal`, the numeric value
/// is an integer, and that integer fits in a `u32`.
pub fn decimal(cp: u32) -> Option<u32> {
    let ch = char_from_codepoint(cp)?;
    match (ch.numeric_type(), ch.numeric_value()) {
        (Some(NumericType::Decimal), Some(Number::Integer(value))) => u32::try_from(value).ok(),
        _ => None,
    }
}
/// Numeric value of `cp`, with no Unicode-version gating.
///
/// Returns `None` when `char_from_codepoint` rejects `cp` or the `ucd` crate
/// reports no numeric value for it.
pub fn numeric(cp: u32) -> Option<NumericValue> {
    let ch = char_from_codepoint(cp)?;
    let value = match ch.numeric_value()? {
        Number::Integer(n) => NumericValue::Integer(n),
        Number::Rational(num, den) => NumericValue::Rational(i64::from(num), i64::from(den)),
    };
    Some(value)
}
/// Maps a decomposition type to the tag text that `decomposition` places
/// between angle brackets.
fn decomposition_type_tag(dt: DecompositionType) -> &'static str {
    match dt {
        // General kinds.
        DecompositionType::Canonical => "canonical",
        DecompositionType::Compat => "compat",
        DecompositionType::Nobreak => "noBreak",
        DecompositionType::Fraction => "fraction",
        DecompositionType::Font => "font",

        // Positional presentation forms.
        DecompositionType::Initial => "initial",
        DecompositionType::Medial => "medial",
        DecompositionType::Final => "final",
        DecompositionType::Isolated => "isolated",

        // Layout and width variants.
        DecompositionType::Vertical => "vertical",
        DecompositionType::Wide => "wide",
        DecompositionType::Narrow => "narrow",
        DecompositionType::Small => "small",
        DecompositionType::Square => "square",
        DecompositionType::Circle => "circle",

        // Script positions.
        DecompositionType::Super => "super",
        DecompositionType::Sub => "sub",
    }
}

View File

@@ -0,0 +1,27 @@
use icu_properties::props::{BinaryProperty, XidContinue, XidStart};
use crate::char_from_codepoint;
/// Reports whether `cp` has the `XID_Start` property.
///
/// Code points that are not valid `char`s never qualify.
pub fn is_xid_start(cp: u32) -> bool {
    match char_from_codepoint(cp) {
        Some(ch) => XidStart::for_char(ch),
        None => false,
    }
}
/// Reports whether `cp` has the `XID_Continue` property.
///
/// Code points that are not valid `char`s never qualify.
pub fn is_xid_continue(cp: u32) -> bool {
    match char_from_codepoint(cp) {
        Some(ch) => XidContinue::for_char(ch),
        None => false,
    }
}
/// Reports whether `cp` may begin a Python identifier: either an underscore
/// or any `XID_Start` code point.
pub fn is_python_identifier_start(cp: u32) -> bool {
    if cp == u32::from('_') {
        return true;
    }
    is_xid_start(cp)
}
/// Reports whether `cp` may appear after the first character of a Python
/// identifier. This is exactly the `is_xid_continue` check.
pub fn is_python_identifier_continue(cp: u32) -> bool {
is_xid_continue(cp)
}
/// Reports whether `text` is a valid Python identifier.
///
/// The first character must satisfy `is_python_identifier_start` and every
/// following character `is_python_identifier_continue`. The empty string is
/// not an identifier.
pub fn is_python_identifier(text: &str) -> bool {
    let mut rest = text.chars();
    match rest.next() {
        Some(first) if is_python_identifier_start(first as u32) => {
            rest.all(|ch| is_python_identifier_continue(ch as u32))
        }
        _ => false,
    }
}

77
crates/unicode/src/lib.rs Normal file
View File

@@ -0,0 +1,77 @@
#![cfg_attr(not(feature = "std"), no_std)]
extern crate alloc;
pub mod case;
pub mod classify;
pub mod data;
pub mod identifier;
pub mod normalize;
pub mod regex;
pub use normalize::NormalizeForm;
pub use unic_ucd_age::{UNICODE_VERSION, UnicodeVersion};
use core::char;
/// Converts a raw code point into a `char`.
///
/// Returns `None` for values that are not Unicode scalar values, i.e.
/// surrogates and anything above `0x10FFFF`.
pub(crate) fn char_from_codepoint(cp: u32) -> Option<char> {
    char::try_from(cp).ok()
}
/// Reports whether `cp` lies in the UTF-16 surrogate range `0xD800..=0xDFFF`.
pub(crate) const fn is_surrogate(cp: u32) -> bool {
    // Every surrogate shares the bit prefix 1101_1xxx_xxxx_xxxx; masking off
    // the low 11 bits leaves exactly 0xD800 for that range and nothing else.
    (cp & 0xFFFF_F800) == 0xD800
}
#[cfg(test)]
mod tests {
    use alloc::vec::Vec;

    use rustpython_wtf8::Wtf8Buf;

    use crate::{NormalizeForm, case, classify, data, identifier, normalize, regex};

    /// A space is printable but not repr-printable; a newline is not printable.
    #[test]
    fn printable_and_repr_printable_follow_python_rules() {
        let space = ' ' as u32;
        let newline = '\n' as u32;

        assert!(classify::is_printable(space));
        assert!(!classify::is_repr_printable(space));
        assert!(!classify::is_printable(newline));
    }

    /// Identifier and regex predicates accept non-ASCII letters, digits and spaces.
    #[test]
    fn identifier_and_regex_predicates_share_unicode_tables() {
        assert!(identifier::is_python_identifier_start('_' as u32));
        assert!(identifier::is_python_identifier("유니코드"));

        assert!(regex::is_unicode_word('가' as u32));
        assert!(regex::is_unicode_digit('५' as u32));
        assert!(regex::is_unicode_space('\u{3000}' as u32));
    }

    /// Case mapping may expand to several code points; normalization round-trips.
    #[test]
    fn case_and_normalization_helpers_support_full_mappings() {
        let expected_upper = ['S' as u32, 'S' as u32];
        let upper: Vec<u32> = case::to_uppercase('ß' as u32).iter().collect();
        assert_eq!(upper, expected_upper);

        let decomposed = Wtf8Buf::from("e\u{301}");
        let composed = normalize::normalize(NormalizeForm::Nfc, &decomposed);
        assert_eq!(composed, Wtf8Buf::from("é"));

        let nfd = normalize::normalize(NormalizeForm::Nfd, &Wtf8Buf::from("é"));
        assert!(normalize::is_normalized(NormalizeForm::Nfd, &nfd));
    }

    /// Spot checks for the `data` queries, including a lone surrogate.
    #[test]
    fn unicode_data_queries_match_existing_unicodedata_behavior() {
        let snowman = '☃' as u32;

        assert_eq!(data::category('A' as u32), "Lu");
        assert_eq!(data::category(0xD800), "Cs");

        assert_eq!(data::lookup("SNOWMAN"), Some(snowman));
        assert_eq!(data::name(snowman).as_deref(), Some("SNOWMAN"));

        assert_eq!(data::decimal('५' as u32), Some(5));
        assert_eq!(data::digit('²' as u32), Some(2));
        assert_eq!(
            data::numeric('⅓' as u32),
            Some(data::NumericValue::Rational(1, 3))
        );
    }
}

View File

@@ -0,0 +1,55 @@
use core::str::FromStr;
use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed};
use rustpython_wtf8::{Wtf8, Wtf8Buf};
/// The Unicode normalization forms accepted by `normalize` and
/// `is_normalized`. The `FromStr` impl parses the upper-case spellings
/// `"NFC"`, `"NFKC"`, `"NFD"` and `"NFKD"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NormalizeForm {
/// Parsed from `"NFC"`; handled with the composing NFC normalizer.
Nfc,
/// Parsed from `"NFKC"`; handled with the composing NFKC normalizer.
Nfkc,
/// Parsed from `"NFD"`; handled with the decomposing NFD normalizer.
Nfd,
/// Parsed from `"NFKD"`; handled with the decomposing NFKD normalizer.
Nfkd,
}
impl FromStr for NormalizeForm {
    type Err = ();

    /// Parses the exact, upper-case form names `"NFC"`, `"NFKC"`, `"NFD"` and
    /// `"NFKD"`. Any other input is rejected with `Err(())`.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let form = match s {
            "NFC" => Self::Nfc,
            "NFKC" => Self::Nfkc,
            "NFD" => Self::Nfd,
            "NFKD" => Self::Nfkd,
            _ => return Err(()),
        };
        Ok(form)
    }
}
/// Normalizes `text` to the requested Unicode normalization form and returns
/// the result as a new `Wtf8Buf`.
///
/// Each arm builds the matching ICU normalizer and feeds it the characters
/// handed out by `Wtf8::map_utf8`.
// NOTE(review): presumably `map_utf8` only passes the valid UTF-8 runs to the
// closure and carries lone surrogates through unchanged — confirm against
// `rustpython_wtf8`.
pub fn normalize(form: NormalizeForm, text: &Wtf8) -> Wtf8Buf {
// The arms repeat the same pipeline because the composing and decomposing
// normalizers are different types and cannot share one binding.
match form {
NormalizeForm::Nfc => {
let normalizer = ComposingNormalizerBorrowed::new_nfc();
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
.collect()
}
NormalizeForm::Nfkc => {
let normalizer = ComposingNormalizerBorrowed::new_nfkc();
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
.collect()
}
NormalizeForm::Nfd => {
let normalizer = DecomposingNormalizerBorrowed::new_nfd();
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
.collect()
}
NormalizeForm::Nfkd => {
let normalizer = DecomposingNormalizerBorrowed::new_nfkd();
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
.collect()
}
}
}
/// Reports whether `text` is already in the given normalization form.
///
/// Implemented by normalizing a copy of `text` and comparing it with the
/// input, so it allocates a buffer the size of the normalized text.
pub fn is_normalized(form: NormalizeForm, text: &Wtf8) -> bool {
    let canonical = normalize(form, text);
    *text == *canonical
}

View File

@@ -0,0 +1,87 @@
use crate::{case, classify};
const UNDERSCORE: u32 = '_' as u32;
/// Reports whether `byte` is one of the six ASCII whitespace bytes accepted
/// here: tab, line feed, vertical tab, form feed, carriage return and space.
const fn is_py_ascii_whitespace(byte: u8) -> bool {
    // `\t`, `\n`, `\x0B`, `\x0C` and `\r` are the contiguous bytes 0x09..=0x0D.
    matches!(byte, b'\t'..=b'\r' | b' ')
}
/// Reports whether `cp` is an ASCII word character: an underscore or an ASCII
/// letter or digit.
pub fn is_word(cp: u32) -> bool {
    if cp == UNDERSCORE {
        return true;
    }
    match u8::try_from(cp) {
        Ok(byte) => byte.is_ascii_alphanumeric(),
        // Anything that does not fit in one byte cannot be ASCII.
        Err(_) => false,
    }
}
pub fn is_space(cp: u32) -> bool {
u8::try_from(cp)
.map(is_py_ascii_whitespace)
.unwrap_or(false)
}
/// Reports whether `cp` is one of the ASCII digits `'0'..='9'`.
pub fn is_digit(cp: u32) -> bool {
    let zero = u32::from(b'0');
    let nine = u32::from(b'9');
    (zero..=nine).contains(&cp)
}
/// Reports whether `cp` is alphanumeric under the locale rules implemented
/// here, which only accept ASCII letters and digits.
pub fn is_locale_alnum(cp: u32) -> bool {
    match u8::try_from(cp) {
        Ok(byte) => byte.is_ascii_alphanumeric(),
        // Anything that does not fit in one byte cannot be ASCII.
        Err(_) => false,
    }
}
/// Reports whether `cp` is a word character under the locale rules: an
/// underscore or anything accepted by `is_locale_alnum`.
pub fn is_locale_word(cp: u32) -> bool {
    if cp == UNDERSCORE {
        return true;
    }
    is_locale_alnum(cp)
}
/// Reports whether `cp` is the line feed character (`'\n'`, U+000A).
pub const fn is_linebreak(cp: u32) -> bool {
    const LINE_FEED: u32 = 0x0A;
    cp == LINE_FEED
}
/// Lower-cases `cp` using ASCII rules only.
///
/// `'A'..='Z'` map to `'a'..='z'`; every other value is returned unchanged.
pub fn lower_ascii(cp: u32) -> u32 {
    match cp {
        // Upper- and lower-case ASCII letters are exactly 0x20 apart.
        0x41..=0x5A => cp + 0x20,
        _ => cp,
    }
}
/// Lower-cases `cp` under the locale rules, which here simply delegate to
/// `lower_ascii`.
pub fn lower_locale(cp: u32) -> u32 {
lower_ascii(cp)
}
/// Upper-cases `cp` under the locale rules, which here are ASCII only.
///
/// `'a'..='z'` map to `'A'..='Z'`; every other value is returned unchanged.
pub fn upper_locale(cp: u32) -> u32 {
    match cp {
        // Upper- and lower-case ASCII letters are exactly 0x20 apart.
        0x61..=0x7A => cp - 0x20,
        _ => cp,
    }
}
/// Reports whether `cp` is a digit for Unicode-mode regex matching. This is
/// exactly `classify::is_decimal`.
pub fn is_unicode_digit(cp: u32) -> bool {
classify::is_decimal(cp)
}
/// Reports whether `cp` is whitespace for Unicode-mode regex matching. This
/// is exactly `classify::is_space`.
pub fn is_unicode_space(cp: u32) -> bool {
classify::is_space(cp)
}
/// Reports whether `cp` is one of the ten code points treated as a line break
/// in Unicode mode: U+000A–U+000D, U+001C–U+001E, U+0085, U+2028 and U+2029.
pub const fn is_unicode_linebreak(cp: u32) -> bool {
    match cp {
        0x000A..=0x000D => true,
        0x001C..=0x001E => true,
        0x0085 | 0x2028 | 0x2029 => true,
        _ => false,
    }
}
/// Reports whether `cp` is alphanumeric for Unicode-mode regex matching. This
/// is exactly `classify::is_alnum`.
pub fn is_unicode_alnum(cp: u32) -> bool {
classify::is_alnum(cp)
}
/// Reports whether `cp` is a word character in Unicode mode: an underscore or
/// anything accepted by `is_unicode_alnum`.
pub fn is_unicode_word(cp: u32) -> bool {
    if cp == UNDERSCORE {
        return true;
    }
    is_unicode_alnum(cp)
}
/// Returns the first element of `case::to_lowercase(cp)`, or `cp` itself when
/// that mapping has no first element.
pub fn lower_unicode(cp: u32) -> u32 {
    match case::to_lowercase(cp).first() {
        Some(mapped) => mapped,
        None => cp,
    }
}
/// Returns the first element of `case::to_uppercase(cp)`, or `cp` itself when
/// that mapping has no first element.
pub fn upper_unicode(cp: u32) -> u32 {
    match case::to_uppercase(cp).first() {
        Some(mapped) => mapped,
        None => cp,
    }
}

View File

@@ -41,6 +41,7 @@ ruff_text_size = { workspace = true, optional = true }
rustpython-compiler-core = { workspace = true } rustpython-compiler-core = { workspace = true }
rustpython-literal = { workspace = true } rustpython-literal = { workspace = true }
rustpython-sre_engine = { workspace = true } rustpython-sre_engine = { workspace = true }
rustpython-unicode = { workspace = true, features = ["casefold"] }
ascii = { workspace = true } ascii = { workspace = true }
ahash = { workspace = true } ahash = { workspace = true }
@@ -74,7 +75,6 @@ strum_macros = { workspace = true }
thiserror = { workspace = true } thiserror = { workspace = true }
memchr = { workspace = true } memchr = { workspace = true }
caseless = "0.2.2"
flamer = { version = "0.5", optional = true } flamer = { version = "0.5", optional = true }
half = "2" half = "2"
psm = "0.1" psm = "0.1"
@@ -86,10 +86,6 @@ timsort = "0.1.2"
# TODO: use unic for this; needed for title case: # TODO: use unic for this; needed for title case:
# https://github.com/RustPython/RustPython/pull/832#discussion_r275428939 # https://github.com/RustPython/RustPython/pull/832#discussion_r275428939
unicode-casing = { workspace = true } unicode-casing = { workspace = true }
# update version all at the same time
unic-ucd-bidi = { workspace = true }
unic-ucd-category = { workspace = true }
unic-ucd-ident = { workspace = true }
[target.'cfg(unix)'.dependencies] [target.'cfg(unix)'.dependencies]
rustix = { workspace = true } rustix = { workspace = true }

View File

@@ -396,6 +396,10 @@ pub trait AnyStr {
fn py_zfill(&self, width: isize) -> Vec<u8> { fn py_zfill(&self, width: isize) -> Vec<u8> {
let width = width.to_usize().unwrap_or(0); let width = width.to_usize().unwrap_or(0);
let char_len = self.elements().count();
let width = self
.bytes_len()
.saturating_add(width.saturating_sub(char_len));
rustpython_common::str::zfill(self.as_bytes(), width) rustpython_common::str::zfill(self.as_bytes(), width)
} }

View File

@@ -41,11 +41,9 @@ use rustpython_common::{
hash, hash,
lock::PyMutex, lock::PyMutex,
str::DeduceStrKind, str::DeduceStrKind,
wtf8::{CodePoint, Wtf8, Wtf8Buf, Wtf8Chunk, Wtf8Concat}, wtf8::{CodePoint, Wtf8, Wtf8Buf, Wtf8Concat},
}; };
use unic_ucd_bidi::BidiClass;
use unic_ucd_category::GeneralCategory;
use unic_ucd_ident::{is_xid_continue, is_xid_start};
use unicode_casing::CharExt; use unicode_casing::CharExt;
impl<'a> TryFromBorrowedObject<'a> for String { impl<'a> TryFromBorrowedObject<'a> for String {
@@ -697,7 +695,7 @@ impl PyStr {
match self.as_str_kind() { match self.as_str_kind() {
PyKindStr::Ascii(s) => s.to_ascii_lowercase().into(), PyKindStr::Ascii(s) => s.to_ascii_lowercase().into(),
PyKindStr::Utf8(s) => s.to_lowercase().into(), PyKindStr::Utf8(s) => s.to_lowercase().into(),
PyKindStr::Wtf8(w) => w.to_lowercase().into(), PyKindStr::Wtf8(w) => rustpython_unicode::case::to_lowercase_wtf8(w).into(),
} }
} }
@@ -705,16 +703,9 @@ impl PyStr {
#[pymethod] #[pymethod]
fn casefold(&self) -> Self { fn casefold(&self) -> Self {
match self.as_str_kind() { match self.as_str_kind() {
PyKindStr::Ascii(s) => caseless::default_case_fold_str(s.as_str()).into(), PyKindStr::Ascii(s) => rustpython_unicode::case::casefold_str(s.as_str()).into(),
PyKindStr::Utf8(s) => caseless::default_case_fold_str(s).into(), PyKindStr::Utf8(s) => rustpython_unicode::case::casefold_str(s).into(),
PyKindStr::Wtf8(w) => w PyKindStr::Wtf8(w) => rustpython_unicode::case::casefold_wtf8(w).into(),
.chunks()
.map(|c| match c {
Wtf8Chunk::Utf8(s) => Wtf8Buf::from_string(caseless::default_case_fold_str(s)),
Wtf8Chunk::Surrogate(c) => Wtf8Buf::from(c),
})
.collect::<Wtf8Buf>()
.into(),
} }
} }
@@ -723,7 +714,7 @@ impl PyStr {
match self.as_str_kind() { match self.as_str_kind() {
PyKindStr::Ascii(s) => s.to_ascii_uppercase().into(), PyKindStr::Ascii(s) => s.to_ascii_uppercase().into(),
PyKindStr::Utf8(s) => s.to_uppercase().into(), PyKindStr::Utf8(s) => s.to_uppercase().into(),
PyKindStr::Wtf8(w) => w.to_uppercase().into(), PyKindStr::Wtf8(w) => rustpython_unicode::case::to_uppercase_wtf8(w).into(),
} }
} }
@@ -966,7 +957,7 @@ impl PyStr {
#[pymethod] #[pymethod]
fn isdecimal(&self) -> bool { fn isdecimal(&self) -> bool {
!self.data.is_empty() !self.data.is_empty()
&& self.char_all(|c| GeneralCategory::of(c) == GeneralCategory::DecimalNumber) && self.char_all(|c| rustpython_unicode::classify::is_decimal(c as u32))
} }
fn __mod__(&self, values: PyObjectRef, vm: &VirtualMachine) -> PyResult<Wtf8Buf> { fn __mod__(&self, values: PyObjectRef, vm: &VirtualMachine) -> PyResult<Wtf8Buf> {
@@ -1086,17 +1077,12 @@ impl PyStr {
#[pymethod] #[pymethod]
fn isprintable(&self) -> bool { fn isprintable(&self) -> bool {
self.char_all(|c| c == '\u{0020}' || rustpython_literal::char::is_printable(c)) self.char_all(|c| rustpython_unicode::classify::is_printable(c as u32))
} }
#[pymethod] #[pymethod]
fn isspace(&self) -> bool { fn isspace(&self) -> bool {
use unic_ucd_bidi::bidi_class::abbr_names::*; !self.data.is_empty() && self.char_all(|c| rustpython_unicode::classify::is_space(c as u32))
!self.data.is_empty()
&& self.char_all(|c| {
GeneralCategory::of(c) == GeneralCategory::SpaceSeparator
|| matches!(BidiClass::of(c), WS | B | S)
})
} }
// Return true if all cased characters in the string are lowercase and there is at least one cased character, false otherwise. // Return true if all cased characters in the string are lowercase and there is at least one cased character, false otherwise.
@@ -1353,11 +1339,8 @@ impl PyStr {
#[pymethod] #[pymethod]
pub fn isidentifier(&self) -> bool { pub fn isidentifier(&self) -> bool {
let Some(s) = self.to_str() else { return false }; self.to_str()
let mut chars = s.chars(); .is_some_and(rustpython_unicode::identifier::is_python_identifier)
let is_identifier_start = chars.next().is_some_and(|c| c == '_' || is_xid_start(c));
// a string is not an identifier if it has whitespace or starts with a number
is_identifier_start && chars.all(is_xid_continue)
} }
// https://docs.python.org/3/library/stdtypes.html#str.translate // https://docs.python.org/3/library/stdtypes.html#str.translate

View File

@@ -21,10 +21,8 @@ mod _sre {
use crossbeam_utils::atomic::AtomicCell; use crossbeam_utils::atomic::AtomicCell;
use itertools::Itertools; use itertools::Itertools;
use num_traits::ToPrimitive; use num_traits::ToPrimitive;
use rustpython_sre_engine::{ use rustpython_sre_engine::{Request, SearchIter, SreFlag, State, StrDrive};
Request, SearchIter, SreFlag, State, StrDrive, use rustpython_unicode::regex as unicode_regex;
string::{lower_ascii, lower_unicode, upper_unicode},
};
#[pyattr] #[pyattr]
pub use rustpython_sre_engine::{CODESIZE, MAXGROUPS, MAXREPEAT, SRE_MAGIC as MAGIC}; pub use rustpython_sre_engine::{CODESIZE, MAXGROUPS, MAXREPEAT, SRE_MAGIC as MAGIC};
@@ -42,17 +40,17 @@ mod _sre {
#[pyfunction] #[pyfunction]
fn unicode_iscased(ch: i32) -> bool { fn unicode_iscased(ch: i32) -> bool {
let ch = ch as u32; let ch = ch as u32;
ch != lower_unicode(ch) || ch != upper_unicode(ch) ch != unicode_regex::lower_unicode(ch) || ch != unicode_regex::upper_unicode(ch)
} }
#[pyfunction] #[pyfunction]
fn ascii_tolower(ch: i32) -> i32 { fn ascii_tolower(ch: i32) -> i32 {
lower_ascii(ch as u32) as i32 unicode_regex::lower_ascii(ch as u32) as i32
} }
#[pyfunction] #[pyfunction]
fn unicode_tolower(ch: i32) -> i32 { fn unicode_tolower(ch: i32) -> i32 {
lower_unicode(ch as u32) as i32 unicode_regex::lower_unicode(ch as u32) as i32
} }
trait SreStr: StrDrive { trait SreStr: StrDrive {

View File

@@ -94,7 +94,7 @@ pub struct VirtualMachine {
pub initialized: bool, pub initialized: bool,
recursion_depth: Cell<usize>, recursion_depth: Cell<usize>,
/// C stack soft limit for detecting stack overflow (like c_stack_soft_limit) /// C stack soft limit for detecting stack overflow (like c_stack_soft_limit)
#[cfg_attr(miri, allow(dead_code))] #[cfg_attr(any(miri, target_env = "musl"), allow(dead_code))]
c_stack_soft_limit: Cell<usize>, c_stack_soft_limit: Cell<usize>,
/// Async generator firstiter hook (per-thread, set via sys.set_asyncgen_hooks) /// Async generator firstiter hook (per-thread, set via sys.set_asyncgen_hooks)
pub async_gen_firstiter: RefCell<Option<PyObjectRef>>, pub async_gen_firstiter: RefCell<Option<PyObjectRef>>,
@@ -1424,12 +1424,12 @@ impl VirtualMachine {
/// Stack margin bytes (like _PyOS_STACK_MARGIN_BYTES). /// Stack margin bytes (like _PyOS_STACK_MARGIN_BYTES).
/// 2048 * sizeof(void*) = 16KB for 64-bit. /// 2048 * sizeof(void*) = 16KB for 64-bit.
#[cfg_attr(miri, allow(dead_code))] #[cfg_attr(any(miri, target_env = "musl"), allow(dead_code))]
const STACK_MARGIN_BYTES: usize = 2048 * core::mem::size_of::<usize>(); const STACK_MARGIN_BYTES: usize = 2048 * core::mem::size_of::<usize>();
/// Get the stack boundaries using platform-specific APIs. /// Get the stack boundaries using platform-specific APIs.
/// Returns (base, top) where base is the lowest address and top is the highest. /// Returns (base, top) where base is the lowest address and top is the highest.
#[cfg(all(not(miri), windows))] #[cfg(all(not(miri), not(target_env = "musl"), windows))]
fn get_stack_bounds() -> (usize, usize) { fn get_stack_bounds() -> (usize, usize) {
use windows_sys::Win32::System::Threading::{ use windows_sys::Win32::System::Threading::{
GetCurrentThreadStackLimits, SetThreadStackGuarantee, GetCurrentThreadStackLimits, SetThreadStackGuarantee,
@@ -1448,7 +1448,7 @@ impl VirtualMachine {
/// Get stack boundaries on non-Windows platforms. /// Get stack boundaries on non-Windows platforms.
/// Falls back to estimating based on current stack pointer. /// Falls back to estimating based on current stack pointer.
#[cfg(all(not(miri), not(windows)))] #[cfg(all(not(miri), not(target_env = "musl"), not(windows)))]
fn get_stack_bounds() -> (usize, usize) { fn get_stack_bounds() -> (usize, usize) {
// Use pthread_attr_getstack on platforms that support it // Use pthread_attr_getstack on platforms that support it
#[cfg(any(target_os = "linux", target_os = "android"))] #[cfg(any(target_os = "linux", target_os = "android"))]
@@ -1499,15 +1499,16 @@ impl VirtualMachine {
/// Calculate the C stack soft limit based on actual stack boundaries. /// Calculate the C stack soft limit based on actual stack boundaries.
/// soft_limit = base + 2 * margin (for downward-growing stacks) /// soft_limit = base + 2 * margin (for downward-growing stacks)
#[cfg(not(miri))] #[cfg(all(not(miri), not(target_env = "musl")))]
fn calculate_c_stack_soft_limit() -> usize { fn calculate_c_stack_soft_limit() -> usize {
let (base, _top) = Self::get_stack_bounds(); let (base, _top) = Self::get_stack_bounds();
// Soft limit is 2 margins above the base
base + Self::STACK_MARGIN_BYTES * 2 base + Self::STACK_MARGIN_BYTES * 2
} }
/// Miri doesn't support inline assembly, so disable C stack checking. /// Musl currently reports stack bounds in a way that trips the VM's
#[cfg(miri)] /// native stack guard during frozen stdlib bootstrap, so keep the Python
/// recursion limit as the only guard there.
#[cfg(any(miri, target_env = "musl"))]
fn calculate_c_stack_soft_limit() -> usize { fn calculate_c_stack_soft_limit() -> usize {
0 0
} }
@@ -1515,19 +1516,18 @@ impl VirtualMachine {
/// Check if we're near the C stack limit (like _Py_MakeRecCheck). /// Check if we're near the C stack limit (like _Py_MakeRecCheck).
/// Returns true only when stack pointer is in the "danger zone" between /// Returns true only when stack pointer is in the "danger zone" between
/// soft_limit and hard_limit (soft_limit - 2*margin). /// soft_limit and hard_limit (soft_limit - 2*margin).
#[cfg(not(miri))] #[cfg(all(not(miri), not(target_env = "musl")))]
#[inline(always)] #[inline(always)]
fn check_c_stack_overflow(&self) -> bool { fn check_c_stack_overflow(&self) -> bool {
let current_sp = psm::stack_pointer() as usize; let current_sp = psm::stack_pointer() as usize;
let soft_limit = self.c_stack_soft_limit.get(); let soft_limit = self.c_stack_soft_limit.get();
// Stack grows downward: check if we're below soft limit but above hard limit
// This matches CPython's _Py_MakeRecCheck behavior
current_sp < soft_limit current_sp < soft_limit
&& current_sp >= soft_limit.saturating_sub(Self::STACK_MARGIN_BYTES * 2) && current_sp >= soft_limit.saturating_sub(Self::STACK_MARGIN_BYTES * 2)
} }
/// Miri doesn't support inline assembly, so always return false. /// Miri does not support the native stack probe, and musl currently trips
#[cfg(miri)] /// the probe during stdlib bootstrap.
#[cfg(any(miri, target_env = "musl"))]
#[inline(always)] #[inline(always)]
fn check_c_stack_overflow(&self) -> bool { fn check_c_stack_overflow(&self) -> bool {
false false

View File

@@ -61,6 +61,9 @@ assert a.endswith(("A", "lo"))
assert not a.endswith("on") assert not a.endswith("on")
assert not a.endswith(("A", "ll")) assert not a.endswith(("A", "ll"))
assert a.zfill(8) == "000Hallo" assert a.zfill(8) == "000Hallo"
assert "á".zfill(4) == "000á"
assert "🙂".zfill(5) == "0000🙂"
assert "+あ".zfill(5) == "+000あ"
assert a.isalnum() assert a.isalnum()
assert not a.isdigit() assert not a.isdigit()
assert not a.isdecimal() assert not a.isdecimal()

View File

@@ -0,0 +1,20 @@
# Smoke test for the Unicode-backed str, unicodedata and re behaviour.
# Characters that are easy to lose in transit are written as escapes.
import re
import unicodedata

# str predicates
assert "유니코드".isidentifier()
assert "\u096b".isdecimal()  # U+096B DEVANAGARI DIGIT FIVE
assert "\u3000".isspace()
assert " ".isprintable()
assert not "\n".isprintable()

# unicodedata queries
assert unicodedata.category("\ud800") == "Cs"
assert unicodedata.lookup("SNOWMAN") == "\u2603"  # U+2603 SNOWMAN
assert unicodedata.name("\u2603") == "SNOWMAN"
assert unicodedata.normalize("NFC", "e\u0301") == "é"
assert unicodedata.digit("²") == 2
assert unicodedata.decimal("\u096b") == 5  # U+096B DEVANAGARI DIGIT FIVE
assert unicodedata.numeric("\u2153") == 1 / 3  # U+2153 VULGAR FRACTION ONE THIRD

# Unicode-aware regex classes
assert re.fullmatch(r"\w+", "가나다")
assert re.fullmatch(r"\d+", "५६७")
assert re.fullmatch(r"\s+", "\u3000")

View File

@@ -3266,9 +3266,9 @@
} }
}, },
"node_modules/lodash": { "node_modules/lodash": {
"version": "4.17.23", "version": "4.18.1",
"resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.23.tgz", "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.18.1.tgz",
"integrity": "sha512-LgVTMpQtIopCi79SJeDiP0TfWi5CNEc/L/aRdTh3yIvmZXTnheWpKjSZhnvMl8iXbC1tFg9gdHHDMLoV7CnG+w==", "integrity": "sha512-dMInicTPVE8d1e5otfwmmjlxkZoUpiVLwyeTdUsi/Caj/gfzzblBcCE5sRHV/AsjuCmxWrte2TNGSYuCeCq+0Q==",
"dev": true, "dev": true,
"license": "MIT" "license": "MIT"
}, },