Skip to content

Commit 3a7f64c

Browse files
fix(extensions): use explicit UTF-8 encoding when reading manifest YAML (#2370)
* fix(extensions): use explicit UTF-8 encoding when reading manifest YAML On Windows, Python's open() defaults to the system locale encoding (e.g., GBK on Chinese Windows), which causes UnicodeDecodeError when extension.yml or preset.yml contains non-ASCII content such as Chinese characters in description fields. Add encoding='utf-8' to ExtensionManifest._load_yaml and PresetManifest._load_yaml so manifests are read consistently across platforms. Fixes #2325 * test(extensions,presets): add UTF-8 manifest regression tests for #2325 Positive: extension.yml/preset.yml with non-ASCII (Chinese + emoji) descriptions load correctly when written as UTF-8 bytes — fails on Windows without explicit encoding='utf-8'. Negative: files containing invalid UTF-8 bytes raise a clean error (ValidationError or UnicodeDecodeError), not a silent crash. * fix(extensions,presets): wrap I/O and decode errors as ValidationError Address remaining Copilot concerns on #2370: - Catch UnicodeDecodeError and OSError in both manifest loaders and re-raise as ValidationError / PresetValidationError so callers see a consistent error type, not a bare decode/IO traceback. - Validate that PresetManifest YAML root is a mapping (extensions.py already had this; presets.py was missing it). Treat None as {} for empty-file compatibility. - Tighten the negative regression tests to assert the specific message, and add a non-mapping-root test for PresetManifest matching the existing one for ExtensionManifest.
1 parent 171b65a commit 3a7f64c

4 files changed

Lines changed: 83 additions & 3 deletions

File tree

src/specify_cli/extensions.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,12 +139,18 @@ def __init__(self, manifest_path: Path):
139139
def _load_yaml(self, path: Path) -> dict:
140140
"""Load YAML file safely."""
141141
try:
142-
with open(path, 'r') as f:
142+
with open(path, 'r', encoding='utf-8') as f:
143143
data = yaml.safe_load(f)
144144
except yaml.YAMLError as e:
145145
raise ValidationError(f"Invalid YAML in {path}: {e}")
146146
except FileNotFoundError:
147147
raise ValidationError(f"Manifest not found: {path}")
148+
except UnicodeDecodeError as e:
149+
raise ValidationError(
150+
f"Manifest is not valid UTF-8: {path} ({e.reason} at byte {e.start})"
151+
)
152+
except OSError as e:
153+
raise ValidationError(f"Could not read manifest {path}: {e}")
148154
if not isinstance(data, dict):
149155
raise ValidationError(
150156
f"Manifest must be a YAML mapping, got {type(data).__name__}: {path}"

src/specify_cli/presets.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,12 +136,25 @@ def __init__(self, manifest_path: Path):
136136
def _load_yaml(self, path: Path) -> dict:
137137
"""Load YAML file safely."""
138138
try:
139-
with open(path, 'r') as f:
140-
return yaml.safe_load(f) or {}
139+
with open(path, 'r', encoding='utf-8') as f:
140+
data = yaml.safe_load(f)
141141
except yaml.YAMLError as e:
142142
raise PresetValidationError(f"Invalid YAML in {path}: {e}")
143143
except FileNotFoundError:
144144
raise PresetValidationError(f"Manifest not found: {path}")
145+
except UnicodeDecodeError as e:
146+
raise PresetValidationError(
147+
f"Manifest is not valid UTF-8: {path} ({e.reason} at byte {e.start})"
148+
)
149+
except OSError as e:
150+
raise PresetValidationError(f"Could not read manifest {path}: {e}")
151+
if data is None:
152+
return {}
153+
if not isinstance(data, dict):
154+
raise PresetValidationError(
155+
f"Manifest must be a YAML mapping, got {type(data).__name__}: {path}"
156+
)
157+
return data
145158

146159
def _validate(self):
147160
"""Validate manifest structure and required fields."""

tests/test_extensions.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,35 @@ def test_non_mapping_yaml_raises_validation_error(self, temp_dir):
225225
with pytest.raises(ValidationError, match="YAML mapping"):
226226
ExtensionManifest(manifest_path)
227227

228+
def test_utf8_non_ascii_description_loads(self, temp_dir, valid_manifest_data):
229+
"""Regression for #2325: non-ASCII (UTF-8) description loads on any platform.
230+
231+
On Windows, Python's default text-mode encoding is the locale codepage
232+
(e.g. cp1252/GBK), which raises UnicodeDecodeError on UTF-8 bytes
233+
outside the ASCII range. The loader must open with encoding='utf-8'.
234+
"""
235+
import yaml
236+
237+
valid_manifest_data["extension"]["description"] = "中文测试 — émojis 🚀"
238+
manifest_path = temp_dir / "extension.yml"
239+
# Write UTF-8 bytes explicitly so the test exercises the read path,
240+
# not the (locale-dependent) write path.
241+
manifest_path.write_bytes(
242+
yaml.safe_dump(valid_manifest_data, allow_unicode=True).encode("utf-8")
243+
)
244+
245+
manifest = ExtensionManifest(manifest_path)
246+
assert manifest.description == "中文测试 — émojis 🚀"
247+
248+
def test_invalid_utf8_bytes_raises_validation_error(self, temp_dir):
249+
"""Negative case: file containing invalid UTF-8 bytes raises ValidationError, not raw UnicodeDecodeError."""
250+
manifest_path = temp_dir / "extension.yml"
251+
# 0xFF/0xFE are not valid UTF-8 lead bytes.
252+
manifest_path.write_bytes(b"\xff\xfe not valid utf-8 \xff\n")
253+
254+
with pytest.raises(ValidationError, match="not valid UTF-8"):
255+
ExtensionManifest(manifest_path)
256+
228257
def test_invalid_extension_id(self, temp_dir, valid_manifest_data):
229258
"""Test manifest with invalid extension ID format."""
230259
import yaml

tests/test_presets.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,38 @@ def test_invalid_yaml(self, temp_dir):
160160
with pytest.raises(PresetValidationError, match="Invalid YAML"):
161161
PresetManifest(bad_file)
162162

163+
def test_utf8_non_ascii_description_loads(self, temp_dir, valid_pack_data):
164+
"""Regression for #2325: non-ASCII (UTF-8) description loads on any platform.
165+
166+
On Windows, Python's default text-mode encoding is the locale codepage
167+
(e.g. cp1252/GBK), which raises UnicodeDecodeError on UTF-8 bytes
168+
outside the ASCII range. The loader must open with encoding='utf-8'.
169+
"""
170+
valid_pack_data["preset"]["description"] = "中文测试 — émojis 🚀"
171+
manifest_path = temp_dir / "preset.yml"
172+
manifest_path.write_bytes(
173+
yaml.safe_dump(valid_pack_data, allow_unicode=True).encode("utf-8")
174+
)
175+
176+
manifest = PresetManifest(manifest_path)
177+
assert manifest.description == "中文测试 — émojis 🚀"
178+
179+
def test_invalid_utf8_bytes_raises_validation_error(self, temp_dir):
180+
"""Negative case: file containing invalid UTF-8 bytes raises PresetValidationError, not raw UnicodeDecodeError."""
181+
manifest_path = temp_dir / "preset.yml"
182+
manifest_path.write_bytes(b"\xff\xfe not valid utf-8 \xff\n")
183+
184+
with pytest.raises(PresetValidationError, match="not valid UTF-8"):
185+
PresetManifest(manifest_path)
186+
187+
def test_non_mapping_yaml_raises_validation_error(self, temp_dir):
188+
"""Manifest whose YAML root is a scalar or list raises PresetValidationError, not TypeError."""
189+
manifest_path = temp_dir / "preset.yml"
190+
for bad_content in ("42\n", "[1, 2]\n"):
191+
manifest_path.write_text(bad_content, encoding="utf-8")
192+
with pytest.raises(PresetValidationError, match="YAML mapping"):
193+
PresetManifest(manifest_path)
194+
163195
def test_missing_schema_version(self, temp_dir, valid_pack_data):
164196
"""Test missing schema_version field."""
165197
del valid_pack_data["schema_version"]

0 commit comments

Comments
 (0)