Coverage for python/lum/clu/processors/sentence.py: 97%

34 statements  

coverage.py v7.6.7, created at 2024-11-17 18:41 +0000

from __future__ import annotations
from pydantic import BaseModel, ConfigDict, Field, model_validator
from lum.clu.processors.directed_graph import DirectedGraph
from lum.clu.processors.utils import Labels
import typing

__all__ = ["Sentence"]

class Sentence(BaseModel):
    """
    Storage class for an annotated sentence. Based on [`org.clulab.processors.Sentence`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/Sentence.scala)
    """

    UNKNOWN: typing.ClassVar[str] = Labels.UNKNOWN
    # the O in IOB notation
    O: typing.ClassVar[str] = Labels.O

    model_config = ConfigDict(populate_by_name=True)

    text: typing.Optional[str] = Field(default=None, description="The text of the `Sentence`.", exclude=True)

    raw: list[str] = Field(description="Raw tokens in this sentence; these are expected to match the original text.")

    words: list[str] = Field(description="A list of the `Sentence`'s tokens.")

    start_offsets: list[int] = Field(alias="startOffsets", description="The character offsets starting each token (inclusive).")

    end_offsets: list[int] = Field(alias="endOffsets", description="The character offsets marking the end of each token (exclusive).")

    tags: typing.Optional[list[str]] = Field(default=None, description="A list of the `Sentence`'s tokens represented using part of speech (PoS) tags.")

    lemmas: typing.Optional[list[str]] = Field(default=None, description="A list of the `Sentence`'s tokens represented using lemmas.")

    norms: typing.Optional[list[str]] = Field(default=None, description="Normalized values of named/numeric entities, such as dates.")

    chunks: typing.Optional[list[str]] = Field(default=None, description="A list of the `Sentence`'s tokens represented using IOB-style phrase labels (ex. `B-NP`, `I-NP`, `B-VP`, etc.).")

    entities: typing.Optional[list[str]] = Field(default=None, description="A list of the `Sentence`'s tokens represented using IOB-style named entity (NE) labels.")

    graphs: dict[str, DirectedGraph] = Field(description="A dictionary (str -> `lum.clu.processors.doc.DirectedGraph`) mapping the graph type/name to a `lum.clu.processors.doc.DirectedGraph`.")

    @model_validator(mode="before")
    @classmethod
    def raw_or_words(cls, data: typing.Any) -> typing.Any:
        """If `raw` is not present, use `words` in its place."""
        if isinstance(data, dict):
            words = data.get("words", None)
            raw = data.get("raw", None)
            if raw is None:
                data["raw"] = words
        return data
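
    # A minimal usage sketch (hypothetical token values). The aliased keys
    # (`startOffsets`, `endOffsets`) presumably match the camelCase JSON emitted by
    # clulab processors; `populate_by_name=True` additionally allows the Python
    # field names (`start_offsets`, ...).
    #
    #     sent = Sentence(
    #         words=["I", "ran", "."],
    #         startOffsets=[0, 2, 5],
    #         endOffsets=[1, 5, 6],
    #         graphs={},
    #     )
    #     sent.raw     # -> ["I", "ran", "."]  (copied from `words` by `raw_or_words`)
    #     sent.length  # -> 3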

    # length : int
    #     The number of tokens in the `Sentence`

    # basic_dependencies : lum.clu.processors.doc.DirectedGraph
    #     A `lum.clu.processors.doc.DirectedGraph` using basic Stanford dependencies.

    # collapsed_dependencies : lum.clu.processors.doc.DirectedGraph
    #     A `lum.clu.processors.doc.DirectedGraph` using collapsed Stanford dependencies.

    # dependencies : lum.clu.processors.doc.DirectedGraph
    #     A pointer to the preferred syntactic dependency graph type for this `Sentence`.

    # _entities : [str]
    #     The IOB-style named entity (NE) labels corresponding to each token.

    # _chunks : [str]
    #     The IOB-style chunk labels corresponding to each token.

    # nes : dict
    #     A dictionary mapping each NE label represented in the `Document` to a list of corresponding text spans (ex. {"PERSON": [phrase 1, ..., phrase n]}). Built from `Sentence._entities`.

    # phrases : dict
    #     A dictionary mapping each chunk label represented in the `Document` to a list of corresponding text spans (ex. {"NP": [phrase 1, ..., phrase n]}). Built from `Sentence._chunks`.

    # Methods
    # -------
    # bag_of_labeled_dependencies_using(form)
    #     Produces a list of syntactic dependencies where each edge is labeled with its grammatical relation.

    # bag_of_unlabeled_dependencies_using(form)
    #     Produces a list of syntactic dependencies where each edge is left unlabeled, without its grammatical relation.

    @property
    def length(self) -> int:
        """The number of tokens in the `Sentence`."""
        return len(self.raw)

    # self.basic_dependencies = self.graphs.get(DirectedGraph.STANFORD_BASIC_DEPENDENCIES, None)
    # self.collapsed_dependencies = self.graphs.get(DirectedGraph.STANFORD_COLLAPSED_DEPENDENCIES, None)
    # self.dependencies = self.collapsed_dependencies if self.collapsed_dependencies is not None else self.basic_dependencies
    # # IOB tokens -> {label: [phrase 1, ..., phrase n]}
    # self.nes = self._handle_iob(self._entities)
    # self.phrases = self._handle_iob(self._chunks)

    # def __eq__(self, other):
    #     if isinstance(other, self.__class__):
    #         return self.to_JSON() == other.to_JSON()
    #     else:
    #         return False

    # def __ne__(self, other):
    #     return not self.__eq__(other)

    # def __hash__(self):
    #     return hash(self.to_JSON(pretty=False))

    # def deduplication_hash(self):
    #     """
    #     Generates a deduplication hash for the sentence.
    #     """
    #     return hashlib.sha256(self.to_JSON(pretty=False).encode()).hexdigest()

    # def _get_tokens(self, form):
    #     f = form.lower()
    #     if f == "words":
    #         tokens = self.words
    #     elif f == "tags":
    #         tokens = self.tags
    #     elif f == "lemmas":
    #         tokens = self.lemmas
    #     elif f == "entities":
    #         tokens = self.nes
    #     elif f == "index":
    #         tokens = list(range(self.length))
    #     # unrecognized form
    #     else:
    #         raise Exception("""form must be 'words', 'tags', 'lemmas', 'entities', or 'index'""")
    #     return tokens

    # def _set_toks(self, toks):
    #     return toks if toks else [Sentence.UNKNOWN] * self.length

    # def _handle_iob(self, iob):
    #     """
    #     Consolidates consecutive tokens in IOB notation under the appropriate label.
    #     Regexes control for the bionlp annotator, which uses IOB notation.
    #     """
    #     entity_dict = defaultdict(list)
    #     # initialize to the empty label
    #     current = Sentence.O
    #     start = None
    #     end = None
    #     for i, tok in enumerate(iob):
    #         # we have an O (no B- or I- tag)
    #         if tok == Sentence.O:
    #             # did we have an entity with the last token?
    #             current = re.sub('(B-|I-)', '', str(current))
    #             if current == Sentence.O:
    #                 continue
    #             else:
    #                 # the last sequence has ended
    #                 end = i
    #                 # store the entity
    #                 named_entity = ' '.join(self.words[start:end])
    #                 entity_dict[current].append(named_entity)
    #                 # reset our book-keeping vars
    #                 current = Sentence.O
    #                 start = None
    #                 end = None
    #         # we have a tag!
    #         else:
    #             # our old sequence continues
    #             current = re.sub('(B-|I-)', '', str(current))
    #             tok = re.sub('(B-|I-)', '', str(tok))
    #             if tok == current:
    #                 end = i
    #             # our old sequence has ended
    #             else:
    #                 # do we have a previous NE?
    #                 if current != Sentence.O:
    #                     end = i
    #                     named_entity = ' '.join(self.words[start:end])
    #                     entity_dict[current].append(named_entity)
    #                 # update our book-keeping vars
    #                 current = tok
    #                 start = i
    #                 end = None
    #     # this might be empty
    #     return entity_dict
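
    # Worked sketch of `_handle_iob` on hypothetical input (assuming
    # words = ["Barack", "Obama", "visited", "Paris", "yesterday"]):
    #
    #     _handle_iob(["B-PER", "I-PER", "O", "B-LOC", "O"])
    #     # -> {"PER": ["Barack Obama"], "LOC": ["Paris"]}
    #
    # Note that, as written above, a sequence that runs through the final token is
    # never flushed: _handle_iob(["B-PER", "I-PER"]) would return an empty dict.
    # A post-loop check like this hypothetical one would cover that case:
    #
    #     if current != Sentence.O and start is not None:
    #         entity_dict[current].append(' '.join(self.words[start:]))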

    # def bag_of_labeled_dependencies_using(self, form):
    #     """
    #     Produces a list of syntactic dependencies
    #     where each edge is labeled with its grammatical relation.
    #     """
    #     tokens = self._get_tokens(form)
    #     return self.labeled_dependencies_from_tokens(tokens) if tokens else None

    # def bag_of_unlabeled_dependencies_using(self, form):
    #     """
    #     Produces a list of syntactic dependencies
    #     where each edge is left unlabeled, without its grammatical relation.
    #     """
    #     tokens = self._get_tokens(form)
    #     return self.unlabeled_dependencies_from_tokens(tokens) if tokens else None

    # def labeled_dependencies_from_tokens(self, tokens):
    #     """
    #     Generates a list of labeled dependencies for a sentence
    #     using the provided tokens.
    #     """
    #     deps = self.dependencies
    #     return [(tokens[out], rel, tokens[dest])
    #             for out in deps.outgoing
    #             for (dest, rel) in deps.outgoing[out]]

    # def unlabeled_dependencies_from_tokens(self, tokens):
    #     """
    #     Generates a list of unlabeled dependencies for a sentence
    #     using the provided tokens.
    #     """
    #     return [(head, dep) for (head, rel, dep) in self.labeled_dependencies_from_tokens(tokens)]
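
    # Illustrative sketch of the two methods above (hypothetical graph values,
    # assuming `deps.outgoing` maps a source token index to a list of
    # (destination index, relation) pairs, as the comprehension above expects):
    #
    #     tokens = ["I", "ran", "."]
    #     deps.outgoing = {1: [(0, "nsubj"), (2, "punct")]}
    #     labeled_dependencies_from_tokens(tokens)    # -> [("ran", "nsubj", "I"), ("ran", "punct", ".")]
    #     unlabeled_dependencies_from_tokens(tokens)  # -> [("ran", "I"), ("ran", ".")]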

    # def semantic_head(self, graph_name="stanford-collapsed", valid_tags={r"^N", "VBG"}, valid_indices=None):
    #     return HeadFinder.semantic_head(self, graph_name, valid_tags, valid_indices)