Coverage for python/lum/clu/processors/document.py: 100% (23 statements)
from __future__ import annotations
from pydantic import BaseModel, Field, ConfigDict
from lum.clu.processors.sentence import Sentence
from lum.clu.processors.utils import Labels
import typing
class Document(BaseModel):
    """
    Storage class for annotated text. Based on [`org.clulab.processors.Document`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/Document.scala).
    """

    # Allow fields to be populated by field name in addition to any alias.
    model_config = ConfigDict(populate_by_name=True)

    id: typing.Optional[str] = Field(default=None, description="A unique ID for the `Document`.")

    text: typing.Optional[str] = Field(default=None, description="The text of the `Document`.")

    sentences: list[Sentence] = Field(description="The sentences comprising the `Document`.")
    @staticmethod
    def merge_documents(docs: list[Document]) -> Document:
        """Merges two or more `Document`s into a single `Document`, shifting
        sentence character offsets so they index into the concatenated text."""
        text = ""
        sentences = []
        offset = 0
        for doc in docs:
            for old in doc.sentences:
                # Rebuild each sentence, shifting its character offsets by the
                # combined length of all previously merged document texts.
                s = Sentence(
                    text=old.text,
                    raw=old.raw,
                    words=old.words,
                    startOffsets=[i + offset for i in old.start_offsets],
                    endOffsets=[i + offset for i in old.end_offsets],
                    tags=old.tags,
                    lemmas=old.lemmas,
                    norms=old.norms,
                    chunks=old.chunks,
                    entities=old.entities,
                    graphs=old.graphs,
                )
                sentences.append(s)
            # Only documents with text contribute to the merged text and offset.
            if doc.text:
                text += doc.text
                offset += len(doc.text)

        return Document(
            id=docs[0].id,
            text=text if len(text) > 0 else None,
            sentences=sentences,
        )
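
    # Usage sketch for `merge_documents` (hypothetical `Sentence` values
    # `s1`/`s2`; not part of the original module):
    #
    #   doc_a = Document(id="a", text="Fido barks.", sentences=[s1])
    #   doc_b = Document(id="b", text="Rex sleeps.", sentences=[s2])
    #   merged = Document.merge_documents([doc_a, doc_b])
    #   # merged.text == "Fido barks.Rex sleeps."
    #   # merged.id == "a" (the first document's ID is kept)
    #   # start/end offsets of doc_b's sentences are shifted by len(doc_a.text)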
    # size : int
    #     The number of `sentences`.
    #
    # words : [str]
    #     A list of the `Document`'s tokens.
    #
    # tags : [str]
    #     A list of the `Document`'s tokens represented using part of speech (PoS) tags.
    #
    # lemmas : [str]
    #     A list of the `Document`'s tokens represented using lemmas.
    #
    # _entities : [str]
    #     A list of the `Document`'s tokens represented using IOB-style named entity (NE) labels.
    #
    # nes : dict
    #     A dictionary mapping each NE label in the `Document` to a list of its corresponding text spans.
    #
    # bag_of_labeled_deps : [str]
    #     The labeled dependencies from all sentences in the `Document`.
    #
    # bag_of_unlabeled_deps : [str]
    #     The unlabeled dependencies from all sentences in the `Document`.
    #
    # text : str or None
    #     The original text of the `Document`.
    #
    # Methods
    # -------
    # bag_of_labeled_dependencies_using(form)
    #     Produces a list of syntactic dependencies where each edge is labeled with its grammatical relation.
    #
    # bag_of_unlabeled_dependencies_using(form)
    #     Produces a list of syntactic dependencies where each edge is left unlabeled (its grammatical relation is omitted).
    # self.nes = self._merge_ne_dicts()
    # self.bag_of_labeled_deps = list(chain(*[s.dependencies.labeled for s in self.sentences]))
    # self.bag_of_unlabeled_deps = list(chain(*[s.dependencies.unlabeled for s in self.sentences]))
    # def __hash__(self):
    #     return hash(self.to_JSON())
    #
    # def __unicode__(self):
    #     return self.text
    #
    # def __str__(self):
    #     return "Document w/ {} Sentence{}".format(self.size, "" if self.size == 1 else "s")
    #
    # def __eq__(self, other):
    #     if isinstance(other, self.__class__):
    #         return self.to_JSON() == other.to_JSON()
    #     else:
    #         return False
    #
    # def __ne__(self, other):
    #     return not self.__eq__(other)
    #
    # def bag_of_labeled_dependencies_using(self, form):
    #     return list(chain(*[s.labeled_dependencies_from_tokens(s._get_tokens(form)) for s in self.sentences]))
    #
    # def bag_of_unlabeled_dependencies_using(self, form):
    #     return list(chain(*[s.unlabeled_dependencies_from_tokens(s._get_tokens(form)) for s in self.sentences]))
    # def _merge_ne_dicts(self):
    #     # Get the set of all NE labels found in the Doc's sentences
    #     entity_labels = set(chain(*[s.nes.keys() for s in self.sentences]))
    #     # Do we have any labels? (a set built via chain is never None,
    #     # so test for emptiness rather than comparing against None)
    #     if not entity_labels:
    #         return None
    #     # If we have labels, consolidate the NEs under the appropriate label
    #     else:
    #         nes_dict = dict()
    #         for e in entity_labels:
    #             entities = []
    #             for s in self.sentences:
    #                 entities += s.nes[e]
    #             nes_dict[e] = entities
    #         return nes_dict
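
    # A minimal standalone sketch of the `_merge_ne_dicts` logic above, using
    # `collections.defaultdict` (assumes each `Sentence` exposes an `nes` dict
    # mapping an NE label to a list of text spans; the names here are
    # illustrative, not part of this module):
    #
    #   from collections import defaultdict
    #
    #   def merge_ne_dicts(sentences) -> dict:
    #       nes_dict = defaultdict(list)
    #       for s in sentences:
    #           for label, spans in s.nes.items():
    #               nes_dict[label].extend(spans)
    #       return dict(nes_dict)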