Coverage for python/lum/clu/processors/sentence.py: 97% (34 statements)

from __future__ import annotations
from pydantic import BaseModel, ConfigDict, Field, model_validator
from lum.clu.processors.directed_graph import DirectedGraph
from lum.clu.processors.utils import Labels
import typing

__all__ = ["Sentence"]

class Sentence(BaseModel):
    """
    Storage class for an annotated sentence. Based on
    [`org.clulab.processors.Sentence`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/Sentence.scala).
    """

    UNKNOWN: typing.ClassVar[str] = Labels.UNKNOWN
    # the O in IOB notation
    O: typing.ClassVar[str] = Labels.O

    model_config = ConfigDict(populate_by_name=True)

    text: typing.Optional[str] = Field(default=None, description="The text of the `Sentence`.", exclude=True)

    raw: list[str] = Field(description="Raw tokens in this sentence; these are expected to match the original text.")

    words: list[str] = Field(description="A list of the `Sentence`'s tokens.")

    start_offsets: list[int] = Field(alias="startOffsets", description="The character offsets starting each token (inclusive).")

    end_offsets: list[int] = Field(alias="endOffsets", description="The character offsets marking the end of each token (exclusive).")

    tags: typing.Optional[list[str]] = Field(default=None, description="A list of the `Sentence`'s tokens represented using part of speech (PoS) tags.")

    lemmas: typing.Optional[list[str]] = Field(default=None, description="A list of the `Sentence`'s tokens represented using lemmas.")

    norms: typing.Optional[list[str]] = Field(default=None, description="Normalized values of named/numeric entities, such as dates.")

    chunks: typing.Optional[list[str]] = Field(default=None, description="A list of the `Sentence`'s tokens represented using IOB-style phrase labels (ex. `B-NP`, `I-NP`, `B-VP`, etc.).")

    entities: typing.Optional[list[str]] = Field(default=None, description="A list of the `Sentence`'s tokens represented using IOB-style named entity (NE) labels.")

    graphs: dict[str, DirectedGraph] = Field(description="A dictionary mapping each graph type/name to a `lum.clu.processors.directed_graph.DirectedGraph`.")

    @model_validator(mode="before")
    @classmethod
    def raw_or_words(cls, data: typing.Any) -> typing.Any:
        """If `raw` is not present, use `words` in its place."""
        if isinstance(data, dict):
            words = data.get("words", None)
            raw = data.get("raw", None)
            if raw is None:
                data["raw"] = words
        return data
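
    # A minimal usage sketch (not part of the original file; token values and
    # offsets are hypothetical). When `raw` is omitted from the input, the
    # validator above backfills it from `words`:
    #
    #   s = Sentence(
    #       words=["Gonzo", "married", "Camilla", "."],
    #       startOffsets=[0, 6, 14, 21],
    #       endOffsets=[5, 13, 21, 22],
    #       graphs={},
    #   )
    #   assert s.raw == s.words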

    # length : int
    #     The number of tokens in the `Sentence`.

    # basic_dependencies : lum.clu.processors.doc.DirectedGraph
    #     A `lum.clu.processors.doc.DirectedGraph` using basic Stanford dependencies.

    # collapsed_dependencies : lum.clu.processors.doc.DirectedGraph
    #     A `lum.clu.processors.doc.DirectedGraph` using collapsed Stanford dependencies.

    # dependencies : lum.clu.processors.doc.DirectedGraph
    #     A pointer to the preferred syntactic dependency graph type for this `Sentence`.

    # _entities : [str]
    #     The IOB-style named entity (NE) labels corresponding to each token.

    # _chunks : [str]
    #     The IOB-style chunk labels corresponding to each token.

    # nes : dict
    #     A dictionary mapping each NE label in the `Document` to a list of corresponding text spans (ex. {"PERSON": [phrase 1, ..., phrase n]}). Built from `Sentence._entities`.

    # phrases : dict
    #     A dictionary mapping each chunk label in the `Document` to a list of corresponding text spans (ex. {"NP": [phrase 1, ..., phrase n]}). Built from `Sentence._chunks`.

    # Methods
    # -------
    # bag_of_labeled_dependencies_using(form)
    #     Produces a list of syntactic dependencies where each edge is labeled with its grammatical relation.

    # bag_of_unlabeled_dependencies_using(form)
    #     Produces a list of syntactic dependencies where each edge is left unlabeled (no grammatical relation).

    @property
    def length(self) -> int:
        return len(self.raw)
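
    # Continuing the sketch above: `length` counts `raw` tokens, so the
    # hypothetical four-token sentence gives `s.length == 4`.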

    # self.basic_dependencies = self.graphs.get(DirectedGraph.STANFORD_BASIC_DEPENDENCIES, None)
    # self.collapsed_dependencies = self.graphs.get(DirectedGraph.STANFORD_COLLAPSED_DEPENDENCIES, None)
    # self.dependencies = self.collapsed_dependencies if self.collapsed_dependencies is not None else self.basic_dependencies
    # # IOB tokens -> {label: [phrase 1, ..., phrase n]}
    # self.nes = self._handle_iob(self._entities)
    # self.phrases = self._handle_iob(self._chunks)

    # def __eq__(self, other):
    #     if isinstance(other, self.__class__):
    #         return self.to_JSON() == other.to_JSON()
    #     else:
    #         return False

    # def __ne__(self, other):
    #     return not self.__eq__(other)

    # def __hash__(self):
    #     return hash(self.to_JSON(pretty=False))

    # def deduplication_hash(self):
    #     """
    #     Generates a deduplication hash for the sentence.
    #     """
    #     return hashlib.sha256(self.to_JSON(pretty=False).encode()).hexdigest()

    # def _get_tokens(self, form):
    #     f = form.lower()
    #     if f == "words":
    #         tokens = self.words
    #     elif f == "tags":
    #         tokens = self.tags
    #     elif f == "lemmas":
    #         tokens = self.lemmas
    #     elif f == "entities":
    #         tokens = self.nes
    #     elif f == "index":
    #         tokens = list(range(self.length))
    #     # unrecognized form
    #     else:
    #         raise Exception("form must be 'words', 'tags', 'lemmas', 'entities', or 'index'")
    #     return tokens

    # def _set_toks(self, toks):
    #     return toks if toks else [Sentence.UNKNOWN] * self.length

    # def _handle_iob(self, iob):
    #     """
    #     Consolidates consecutive tokens in IOB notation under the appropriate label.
    #     Regexes control for the bionlp annotator, which uses IOB notation.
    #     """
    #     entity_dict = defaultdict(list)
    #     # initialize to empty label
    #     current = Sentence.O
    #     start = None
    #     end = None
    #     for i, tok in enumerate(iob):
    #         # we have an O (no entity tag)
    #         if tok == Sentence.O:
    #             # did we have an entity with the last token?
    #             current = re.sub('(B-|I-)', '', str(current))
    #             if current == Sentence.O:
    #                 continue
    #             else:
    #                 # the last sequence has ended
    #                 end = i
    #                 # store the entity
    #                 named_entity = ' '.join(self.words[start:end])
    #                 entity_dict[current].append(named_entity)
    #                 # reset our book-keeping vars
    #                 current = Sentence.O
    #                 start = None
    #                 end = None
    #         # we have a tag!
    #         else:
    #             # our old sequence continues
    #             current = re.sub('(B-|I-)', '', str(current))
    #             tok = re.sub('(B-|I-)', '', str(tok))
    #             if tok == current:
    #                 end = i
    #             # our old sequence has ended
    #             else:
    #                 # do we have a previous NE?
    #                 if current != Sentence.O:
    #                     end = i
    #                     named_entity = ' '.join(self.words[start:end])
    #                     entity_dict[current].append(named_entity)
    #                 # update our book-keeping vars
    #                 current = tok
    #                 start = i
    #                 end = None
    #     # this might be empty
    #     return entity_dict
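
    # A sketch (hypothetical values) of the consolidation `_handle_iob`
    # performed. Note the loop only flushes an entity once a later token
    # closes the sequence, so the example ends with an O token:
    #
    #   words: ["Barack", "Obama", "visited", "Paris", "."]
    #   iob:   ["B-PERSON", "I-PERSON", "O", "B-LOCATION", "O"]
    #   # => {"PERSON": ["Barack Obama"], "LOCATION": ["Paris"]}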

    # def bag_of_labeled_dependencies_using(self, form):
    #     """
    #     Produces a list of syntactic dependencies
    #     where each edge is labeled with its grammatical relation.
    #     """
    #     tokens = self._get_tokens(form)
    #     return self.labeled_dependencies_from_tokens(tokens) if tokens else None

    # def bag_of_unlabeled_dependencies_using(self, form):
    #     """
    #     Produces a list of syntactic dependencies
    #     where each edge is left unlabeled (no grammatical relation).
    #     """
    #     tokens = self._get_tokens(form)
    #     return self.unlabeled_dependencies_from_tokens(tokens) if tokens else None

    # def labeled_dependencies_from_tokens(self, tokens):
    #     """
    #     Generates a list of labeled dependencies for a sentence
    #     using the provided tokens.
    #     """
    #     deps = self.dependencies
    #     return [
    #         (tokens[out], rel, tokens[dest])
    #         for out in deps.outgoing
    #         for (dest, rel) in deps.outgoing[out]
    #     ]

    # def unlabeled_dependencies_from_tokens(self, tokens):
    #     """
    #     Generates a list of unlabeled dependencies for a sentence
    #     using the provided tokens.
    #     """
    #     return [(head, dep) for (head, rel, dep) in self.labeled_dependencies_from_tokens(tokens)]
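
    # A sketch (hypothetical tokens and graph) of the shapes these legacy
    # methods produced. With tokens = ["Gonzo", "married", "Camilla"] and
    # edges 1 -nsubj-> 0 and 1 -dobj-> 2 in `self.dependencies`:
    #
    #   labeled   = [("married", "nsubj", "Gonzo"), ("married", "dobj", "Camilla")]
    #   unlabeled = [("married", "Gonzo"), ("married", "Camilla")]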

    # def semantic_head(self, graph_name="stanford-collapsed", valid_tags={r"^N", "VBG"}, valid_indices=None):
    #     return HeadFinder.semantic_head(self, graph_name, valid_tags, valid_indices)