Coverage for python/lum/clu/processors/tests/utils.py: 100%
16 statements
« prev ^ index » next coverage.py v7.6.7, created at 2024-11-17 18:41 +0000
« prev ^ index » next coverage.py v7.6.7, created at 2024-11-17 18:41 +0000
1from pathlib import Path
2from lum.clu.processors.document import Document as CluDocument
3import json
4import typing
6__all__ = ["load_test_docs", "check_doc_token_alignment"]
def load_test_docs(
    filenames: typing.Iterable[str] = (
        "doc-part-1.json",
        "doc-part-2.json",
        "doc-part-3.json",
    ),
) -> typing.Iterator[CluDocument]:
    """Lazily load JSON test fixtures as ``CluDocument`` instances.

    Each name in *filenames* is resolved against the ``data`` directory that
    sits next to this module, parsed as JSON, and the resulting mapping is
    expanded as keyword arguments to ``CluDocument``.

    Args:
        filenames: Fixture file names to load, in order. Any iterable of
            strings is accepted. The default is an immutable tuple so the
            default value cannot be mutated between calls.

    Yields:
        One ``CluDocument`` per file name, in the order given.

    Raises:
        OSError: If a fixture file cannot be opened (for example
            ``FileNotFoundError``).
        json.JSONDecodeError: If a fixture file is not valid JSON.
    """
    # The fixture directory does not depend on the file name; resolve it once.
    data_dir = Path(__file__).resolve().parent / "data"
    for filename in filenames:
        # Decode explicitly as UTF-8 rather than the platform's locale
        # encoding, so non-ASCII fixture text loads the same way everywhere.
        with open(data_dir / filename, "r", encoding="utf-8") as infile:
            data = json.load(infile)
        # The file is already closed here, so no handle stays open while the
        # generator is suspended between items.
        yield CluDocument(**data)
def check_doc_token_alignment(doc: CluDocument):
    """Assert that each token's character offsets point at that token in ``doc.text``.

    For every sentence in ``doc.sentences``, the entries of ``raw``,
    ``start_offsets`` and ``end_offsets`` are walked in lockstep and each raw
    token is compared with the slice ``doc.text[start:end]``.

    Args:
        doc: Document whose sentences expose ``raw``, ``start_offsets`` and
            ``end_offsets`` sequences, and whose ``text`` supports slicing.

    Raises:
        AssertionError: If a sentence's three sequences differ in length, or
            if any raw token differs from the text slice its offsets select.

    Note:
        The checks are plain ``assert`` statements, so they are skipped when
        Python runs with ``-O``.
    """
    for i, s in enumerate(doc.sentences):
        # zip() stops at the shortest input, so a sentence with missing
        # offsets (or extra tokens) would otherwise pass with its tail
        # never checked. Require the three sequences to be parallel first.
        n_raw, n_start, n_end = len(s.raw), len(s.start_offsets), len(s.end_offsets)
        assert n_raw == n_start == n_end, (
            f"Sentence {i} has {n_raw} raw tokens but {n_start} start offsets "
            f"and {n_end} end offsets ({' '.join(s.raw)})"
        )
        for raw_tok, start, end in zip(s.raw, s.start_offsets, s.end_offsets):
            orig_tok = doc.text[start:end]
            assert orig_tok == raw_tok, f"Expected '{orig_tok}' == '{raw_tok}' for doc[{start}:{end}] and sentence {i} ({' '.join(s.raw)})"