Coverage for python/lum/clu/processors/sentence.py: 97%

34 statements  

coverage.py v7.6.7, created at 2024-11-17 18:41 +0000

from __future__ import annotations
from pydantic import BaseModel, ConfigDict, Field, model_validator
from lum.clu.processors.directed_graph import DirectedGraph
from lum.clu.processors.utils import Labels
import typing

__all__ = ["Sentence"]

class Sentence(BaseModel):
    """
    Storage class for an annotated sentence. Based on [`org.clulab.processors.Sentence`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/Sentence.scala)
    """

    UNKNOWN: typing.ClassVar[str] = Labels.UNKNOWN
    # the O in IOB notation
    O: typing.ClassVar[str] = Labels.O

    model_config = ConfigDict(populate_by_name=True)

    text: typing.Optional[str] = Field(default=None, description="The text of the `Sentence`.", exclude=True)

    raw: list[str] = Field(description="Raw tokens in this sentence; these are expected to match the original text.")

    words: list[str] = Field(description="A list of the `Sentence`'s tokens.")

    start_offsets: list[int] = Field(alias="startOffsets", description="The character offsets starting each token (inclusive).")

    end_offsets: list[int] = Field(alias="endOffsets", description="The character offsets marking the end of each token (exclusive).")

    tags: typing.Optional[list[str]] = Field(default=None, description="A list of the `Sentence`'s tokens represented using part of speech (PoS) tags.")

    lemmas: typing.Optional[list[str]] = Field(default=None, description="A list of the `Sentence`'s tokens represented using lemmas.")

    norms: typing.Optional[list[str]] = Field(default=None, description="Normalized values of named/numeric entities, such as dates.")

    chunks: typing.Optional[list[str]] = Field(default=None, description="A list of the `Sentence`'s tokens represented using IOB-style phrase labels (ex. `B-NP`, `I-NP`, `B-VP`, etc.).")

    entities: typing.Optional[list[str]] = Field(default=None, description="A list of the `Sentence`'s tokens represented using IOB-style named entity (NE) labels.")

    graphs: dict[str, DirectedGraph] = Field(description="A dictionary (str -> `lum.clu.processors.doc.DirectedGraph`) mapping the graph type/name to a `lum.clu.processors.doc.DirectedGraph`.")

    @model_validator(mode="before")
    @classmethod
    def raw_or_words(cls, data: typing.Any) -> typing.Any:
        """If `raw` is not present, use `words` in its place."""
        if isinstance(data, dict):
            words = data.get("words", None)
            raw = data.get("raw", None)
            if raw is None:
                data["raw"] = words
        return data
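
    # A minimal usage sketch (hypothetical token values). The aliased keys
    # (`startOffsets`, `endOffsets`) presumably match the camelCase JSON emitted by
    # clulab processors; `populate_by_name=True` additionally allows the Python
    # field names (`start_offsets`, ...).
    #
    #     sent = Sentence(
    #         words=["I", "ran", "."],
    #         startOffsets=[0, 2, 5],
    #         endOffsets=[1, 5, 6],
    #         graphs={},
    #     )
    #     sent.raw     # -> ["I", "ran", "."]  (copied from `words` by `raw_or_words`)
    #     sent.length  # -> 3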

    # length : int
    #     The number of tokens in the `Sentence`

    # basic_dependencies : lum.clu.processors.doc.DirectedGraph
    #     A `lum.clu.processors.doc.DirectedGraph` using basic Stanford dependencies.

    # collapsed_dependencies : lum.clu.processors.doc.DirectedGraph
    #     A `lum.clu.processors.doc.DirectedGraph` using collapsed Stanford dependencies.

    # dependencies : lum.clu.processors.doc.DirectedGraph
    #     A pointer to the preferred syntactic dependency graph type for this `Sentence`.

    # _entities : [str]
    #     The IOB-style named entity (NE) labels corresponding to each token.

    # _chunks : [str]
    #     The IOB-style chunk labels corresponding to each token.

    # nes : dict
    #     A dictionary mapping each NE label represented in the `Document` to a list of corresponding text spans (ex. {"PERSON": [phrase 1, ..., phrase n]}). Built from `Sentence._entities`.

    # phrases : dict
    #     A dictionary mapping each chunk label represented in the `Document` to a list of corresponding text spans (ex. {"NP": [phrase 1, ..., phrase n]}). Built from `Sentence._chunks`.

    # Methods
    # -------
    # bag_of_labeled_dependencies_using(form)
    #     Produces a list of syntactic dependencies where each edge is labeled with its grammatical relation.

    # bag_of_unlabeled_dependencies_using(form)
    #     Produces a list of syntactic dependencies where each edge is left unlabeled, without its grammatical relation.

    @property
    def length(self) -> int:
        """The number of tokens in the `Sentence`."""
        return len(self.raw)

    # self.basic_dependencies = self.graphs.get(DirectedGraph.STANFORD_BASIC_DEPENDENCIES, None)
    # self.collapsed_dependencies = self.graphs.get(DirectedGraph.STANFORD_COLLAPSED_DEPENDENCIES, None)
    # self.dependencies = self.collapsed_dependencies if self.collapsed_dependencies is not None else self.basic_dependencies
    # # IOB tokens -> {label: [phrase 1, ..., phrase n]}
    # self.nes = self._handle_iob(self._entities)
    # self.phrases = self._handle_iob(self._chunks)

    # def __eq__(self, other):
    #     if isinstance(other, self.__class__):
    #         return self.to_JSON() == other.to_JSON()
    #     else:
    #         return False

    # def __ne__(self, other):
    #     return not self.__eq__(other)

    # def __hash__(self):
    #     return hash(self.to_JSON(pretty=False))

    # def deduplication_hash(self):
    #     """
    #     Generates a deduplication hash for the sentence.
    #     """
    #     return hashlib.sha256(self.to_JSON(pretty=False).encode()).hexdigest()

    # def _get_tokens(self, form):
    #     f = form.lower()
    #     if f == "words":
    #         tokens = self.words
    #     elif f == "tags":
    #         tokens = self.tags
    #     elif f == "lemmas":
    #         tokens = self.lemmas
    #     elif f == "entities":
    #         tokens = self.nes
    #     elif f == "index":
    #         tokens = list(range(self.length))
    #     # unrecognized form
    #     else:
    #         raise Exception("""form must be 'words', 'tags', 'lemmas', 'entities', or 'index'""")
    #     return tokens

    # def _set_toks(self, toks):
    #     return toks if toks else [Sentence.UNKNOWN] * self.length

    # def _handle_iob(self, iob):
    #     """
    #     Consolidates consecutive tokens in IOB notation under the appropriate label.
    #     Regexes control for the bionlp annotator, which uses IOB notation.
    #     """
    #     entity_dict = defaultdict(list)
    #     # initialize to the empty label
    #     current = Sentence.O
    #     start = None
    #     end = None
    #     for i, tok in enumerate(iob):
    #         # we have an O (no B- or I- tag)
    #         if tok == Sentence.O:
    #             # did we have an entity with the last token?
    #             current = re.sub('(B-|I-)', '', str(current))
    #             if current == Sentence.O:
    #                 continue
    #             else:
    #                 # the last sequence has ended
    #                 end = i
    #                 # store the entity
    #                 named_entity = ' '.join(self.words[start:end])
    #                 entity_dict[current].append(named_entity)
    #                 # reset our book-keeping vars
    #                 current = Sentence.O
    #                 start = None
    #                 end = None
    #         # we have a tag!
    #         else:
    #             # our old sequence continues
    #             current = re.sub('(B-|I-)', '', str(current))
    #             tok = re.sub('(B-|I-)', '', str(tok))
    #             if tok == current:
    #                 end = i
    #             # our old sequence has ended
    #             else:
    #                 # do we have a previous NE?
    #                 if current != Sentence.O:
    #                     end = i
    #                     named_entity = ' '.join(self.words[start:end])
    #                     entity_dict[current].append(named_entity)
    #                 # update our book-keeping vars
    #                 current = tok
    #                 start = i
    #                 end = None
    #     # this might be empty
    #     return entity_dict
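
    # Worked sketch of `_handle_iob` on hypothetical input (assuming
    # words = ["Barack", "Obama", "visited", "Paris", "yesterday"]):
    #
    #     _handle_iob(["B-PER", "I-PER", "O", "B-LOC", "O"])
    #     # -> {"PER": ["Barack Obama"], "LOC": ["Paris"]}
    #
    # Note that, as written above, a sequence that runs through the final token is
    # never flushed: _handle_iob(["B-PER", "I-PER"]) would return an empty dict.
    # A post-loop check like this hypothetical one would cover that case:
    #
    #     if current != Sentence.O and start is not None:
    #         entity_dict[current].append(' '.join(self.words[start:]))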

    # def bag_of_labeled_dependencies_using(self, form):
    #     """
    #     Produces a list of syntactic dependencies
    #     where each edge is labeled with its grammatical relation.
    #     """
    #     tokens = self._get_tokens(form)
    #     return self.labeled_dependencies_from_tokens(tokens) if tokens else None

    # def bag_of_unlabeled_dependencies_using(self, form):
    #     """
    #     Produces a list of syntactic dependencies
    #     where each edge is left unlabeled, without its grammatical relation.
    #     """
    #     tokens = self._get_tokens(form)
    #     return self.unlabeled_dependencies_from_tokens(tokens) if tokens else None

    # def labeled_dependencies_from_tokens(self, tokens):
    #     """
    #     Generates a list of labeled dependencies for a sentence
    #     using the provided tokens.
    #     """
    #     deps = self.dependencies
    #     return [(tokens[out], rel, tokens[dest])
    #             for out in deps.outgoing
    #             for (dest, rel) in deps.outgoing[out]]

    # def unlabeled_dependencies_from_tokens(self, tokens):
    #     """
    #     Generates a list of unlabeled dependencies for a sentence
    #     using the provided tokens.
    #     """
    #     return [(head, dep) for (head, rel, dep) in self.labeled_dependencies_from_tokens(tokens)]
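
    # Illustrative sketch of the two methods above (hypothetical graph values,
    # assuming `deps.outgoing` maps a source token index to a list of
    # (destination index, relation) pairs, as the comprehension above expects):
    #
    #     tokens = ["I", "ran", "."]
    #     deps.outgoing = {1: [(0, "nsubj"), (2, "punct")]}
    #     labeled_dependencies_from_tokens(tokens)    # -> [("ran", "nsubj", "I"), ("ran", "punct", ".")]
    #     unlabeled_dependencies_from_tokens(tokens)  # -> [("ran", "I"), ("ran", ".")]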

    # def semantic_head(self, graph_name="stanford-collapsed", valid_tags={r"^N", "VBG"}, valid_indices=None):
    #     return HeadFinder.semantic_head(self, graph_name, valid_tags, valid_indices)