Coverage for python/lum/clu/processors/tests/utils.py: 100%

16 statements  

« prev     ^ index     » next       coverage.py v7.6.7, created at 2024-11-17 18:41 +0000

1from pathlib import Path 

2from lum.clu.processors.document import Document as CluDocument 

3import json 

4import typing 

5 

# Public helpers re-exported by ``from <this module> import *``.
__all__ = [
    "load_test_docs",
    "check_doc_token_alignment",
]

7 

8 

# Fixture files looked up in the ``data`` directory next to this module.
# Kept as a tuple so the default argument below can never be mutated
# between calls.
_DEFAULT_TEST_DOC_FILENAMES = (
    "doc-part-1.json",
    "doc-part-2.json",
    "doc-part-3.json",
)


def load_test_docs(
    filenames: typing.Iterable[str] = _DEFAULT_TEST_DOC_FILENAMES,
) -> typing.Iterator["CluDocument"]:
    """Lazily load JSON test fixtures as ``CluDocument`` instances.

    Args:
        filenames: Names of JSON files inside the ``data`` directory that
            sits next to this module. Defaults to the three
            ``doc-part-*.json`` fixtures.

    Yields:
        One ``CluDocument`` per file name, built by passing the parsed JSON
        object to the constructor as keyword arguments.

    Raises:
        FileNotFoundError: If a named file does not exist. Because this is
            a generator, the error surfaces during iteration, not at call
            time.
    """
    # Resolve the fixture directory once instead of on every iteration.
    data_dir = Path(__file__).resolve().parent / "data"
    for filename in filenames:
        # Decode explicitly as UTF-8 rather than with the platform's locale
        # encoding, so non-ASCII text is read identically everywhere.
        with (data_dir / filename).open("r", encoding="utf-8") as infile:
            data = json.load(infile)
        # Yield after the file is closed so no handle stays open while the
        # consumer holds the generator suspended.
        yield CluDocument(**data)

15 

def check_doc_token_alignment(doc: "CluDocument") -> None:
    """Assert that every token's character offsets point at that token.

    For each sentence in ``doc.sentences``, the i-th entry of ``raw`` must
    equal ``doc.text[start_offsets[i]:end_offsets[i]]``.

    Args:
        doc: Document exposing ``text`` and ``sentences``; each sentence
            exposes ``raw``, ``start_offsets`` and ``end_offsets``.
            Assumes those three attributes are sized sequences (``len`` is
            called on them) -- TODO confirm against ``CluDocument``.

    Raises:
        AssertionError: If a sentence's three token-level sequences differ
            in length, or if any sliced text differs from the raw token.
    """
    for i, s in enumerate(doc.sentences):
        # ``zip`` stops at the shortest input, so without this guard a
        # sentence with missing offsets would pass with its trailing
        # tokens never being compared.
        n_raw, n_start, n_end = len(s.raw), len(s.start_offsets), len(s.end_offsets)
        assert n_raw == n_start == n_end, (
            f"Sentence {i} has {n_raw} raw tokens but {n_start} start offsets "
            f"and {n_end} end offsets ({' '.join(s.raw)})"
        )
        for raw_tok, start, end in zip(s.raw, s.start_offsets, s.end_offsets):
            orig_tok = doc.text[start:end]
            assert orig_tok == raw_tok, f"Expected '{orig_tok}' == '{raw_tok}' for doc[{start}:{end}] and sentence {i} ({' '.join(s.raw)})"