Coverage for python/lum/clu/odin/mention.py: 63%

141 statements  

« prev     ^ index     » next       coverage.py v7.6.7, created at 2024-11-17 18:41 +0000

1# -*- coding: utf-8 -*- 

2from __future__ import annotations 

3from pydantic import BaseModel, Field 

4from lum.clu.processors.document import Document 

5from lum.clu.processors.sentence import Sentence 

6from lum.clu.processors.interval import Interval 

7from lum.clu.odin.synpath import SynPath 

8import re 

9import typing 

10 

11 

12__all__ = ["Mention", "TextBoundMention", "RelationMention", "EventMention", "CrossSentenceMention"] 

13 

14# MentionTypes = typing.Union[TextBoundMention, EventMention, RelationMention, CrossSentenceMention] 

15 

16class Mention(BaseModel): 

17 

18 Paths: typing.ClassVar[typing.TypeAlias] = dict[str, dict["Mention", SynPath]] 

19 

20 Arguments: typing.ClassVar[typing.TypeAlias] = dict[str, list["Mention"]] 

21 # FIXME: add validation that this is non-empty? 

22 labels: list[str] = Field(description="A sequence of labels for this mention. The first label in the sequence is considered the default.") 

23 # alias="tokenInterval",  

24 # TODO: consider adding https://docs.pydantic.dev/latest/api/config/#pydantic.config.ConfigDict.populate_by_name 

25 token_interval: Interval = Field(description="The interval of token indicess that form this mention.") 

26 # alias="sentence",  

27 sentence_index: int = Field(description="The index of the sentence where this mention occurs.") 

28 

29 document: Document = Field(description="The document where this mention occurs") 

30 

31 keep: bool = Field(default=True, description="Should we report this mention at the end?") 

32 

33 arguments: typing.Optional[Mention.Arguments] = Field(default=None, description="A map from argument name to a sequence of mentions. The value of the map is a sequence because there are events that can have several arguments with the same name. For example, in the biodomain, Binding may have several themes.") 

34 

35 paths: typing.Optional[Paths] = Field(default=None, description="Graph traversal leading to each argument") 

36 # alias="foundBy" 

37 found_by: str = Field(default="unknown", description="The name of the rule that produced this mention") 

38 

39 def copy( 

40 self, 

41 maybe_labels: typing.Optional[list[str]] = None, 

42 maybe_token_interval: typing.Optional[Interval] = None, 

43 maybe_sentence_index: typing.Optional[int] = None, 

44 maybe_document: typing.Optional[Document] = None, 

45 maybe_keep: typing.Optional[bool] = None, 

46 maybe_arguments: typing.Optional[Mention.Arguments] = None, 

47 maybe_paths: typing.Optional[Mention.Paths] = None, 

48 maybe_found_by: typing.Optional[str] = None, 

49 ) -> Mention: 

50 return Mention( 

51 labels = maybe_labels or self.labels, 

52 token_interval = maybe_token_interval or self.token_interval, 

53 sentence_index = maybe_sentence_index or self.sentence_index, 

54 document = maybe_document or self.document, 

55 keep = maybe_keep or self.keep, 

56 arguments = maybe_arguments or self.arguments, 

57 paths = maybe_paths or self.paths, 

58 found_by = maybe_found_by or self.found_by 

59 ) 

60 

61 @property 

62 def label(self) -> str: 

63 """the first label for the mention""" 

64 return self.labels[0] 

65 

66 @property 

67 def start(self) -> int: 

68 """index of the first token in the mention""" 

69 return self.token_interval.start 

70 

71 @property 

72 def end(self) -> int: 

73 """one after the last token in the mention""" 

74 return self.token_interval.end 

75 

76 @property 

77 def sentence_obj(self) -> Sentence: 

78 return self.document.sentences[self.sentence_index] 

79 

80 @property 

81 def sentenceObj(self) -> Sentence: 

82 self.sentence_obj 

83 

84 @property 

85 def start_offset(self) -> int: 

86 """character offset of the mention beginning""" 

87 return self.sentence_obj.start_offsets[self.start] 

88 

89 @property 

90 def startOffset(self) -> int: 

91 """character offset of the mention beginning""" 

92 return self.start_offset 

93 

94 @property 

95 def char_start_offset(self) -> int: 

96 """character offset of the mention beginning""" 

97 return self.start_offset 

98 

99 @property 

100 def end_offset(self) -> int: 

101 """character offset of the mention end""" 

102 return self.sentence_obj.end_offsets[self.end - 1] 

103 

104 @property 

105 def endOffset(self) -> int: 

106 """character offset of the mention end""" 

107 return self.end_offset 

108 

109 @property 

110 def char_end_offset(self) -> int: 

111 """character offset of the mention end""" 

112 return self.end_offset 

113 

114 @property 

115 def is_valid(self) -> bool: 

116 """returns true if this is a valid mention""" 

117 return True 

118 

119 @property 

120 def isValid(self) -> bool: 

121 """returns true if this is a valid mention""" 

122 return self.is_valid 

123 

124 def matches(self, label_or_pattern: typing.Union[str, re.Pattern]) -> bool: 

125 """returns true if `label_or_pattern` matches any of the mention labels""" 

126 if isinstance(label_or_pattern, str): 

127 return label_or_pattern in self.labels 

128 elif isinstance(label_or_pattern, re.Pattern): 

129 patt = label_or_pattern 

130 return True if any(re.match(patt, lbl) != None for lbl in self.labels) else False 

131 return False 

132 

133 @property 

134 def raw(self) -> list[str]: 

135 """returns all raw (original, no processing applied) tokens in mention""" 

136 return self.sentence_obj.raw[self.start:self.end] 

137 

138 @property 

139 def words(self) -> list[str]: 

140 """returns all tokens in mention""" 

141 return self.sentence_obj.words[self.start:self.end] 

142 

143 @property 

144 def tags(self) -> typing.Optional[list[str]]: 

145 """returns all tags in mention""" 

146 if self.sentence_obj.tags: 

147 return self.sentence_obj.tags[self.start:self.end] 

148 return None 

149 

150 @property 

151 def lemmas(self) -> typing.Optional[list[str]]: 

152 """returns all lemmas in mention""" 

153 if self.sentence_obj.lemmas: 

154 return self.sentence_obj.lemmas[self.start:self.end] 

155 return None 

156 

157 @property 

158 def entities(self) -> typing.Optional[list[str]]: 

159 """returns all entities in mention""" 

160 if self.sentence_obj.entities: 

161 return self.sentence_obj.entities[self.start:self.end] 

162 return None 

163 

164 @property 

165 def norms(self) -> typing.Optional[list[str]]: 

166 """returns all norms in mention""" 

167 if self.sentence_obj.norms: 

168 return self.sentence_obj.norms[self.start:self.end] 

169 return None 

170 

171 @property 

172 def chunks(self) -> typing.Optional[list[str]]: 

173 """returns all chunks in mention""" 

174 if self.sentence_obj.chunks: 

175 return self.sentence_obj.chunks[self.start:self.end] 

176 return None 

177 

178 @property 

179 def text(self) -> str: 

180 """returns a string that contains the mention""" 

181 _text = self.document.text 

182 if _text is not None: 

183 return _text[self.start_offset:self.end_offset] 

184 # FIXME: this can be improved 

185 else: 

186 return " ".join(self.raw[self.start:self.end]) 

187 

188 # /** returns a string that contains the mention */ 

189 # def text: String = document.text match { 

190 # case Some(txt) => txt.slice(startOffset, endOffset) 

191 # case None => 

192 # // try to reconstruct the sentence using the character offsets 

193 # val bits = raw.head +: tokenInterval.tail.map { i => 

194 # val spaces = " " * (sentenceObj.startOffsets(i) - sentenceObj.endOffsets(i - 1)) 

195 # val rawWord = sentenceObj.raw(i) 

196 # spaces + rawWord 

197 # } 

198 # bits.mkString 

199 # } 

200 

201 # /** returns all syntactic heads */ 

202 # def synHeads: Seq[Int] = sentenceObj.dependencies match { 

203 # case Some(deps) => DependencyUtils.findHeads(tokenInterval, deps) 

204 # case None => Nil 

205 # } 

206 

207 # /** returns the minimum distance to a root node for dependencies within the token interval */ 

208 # def distToRootOpt: Option[Int] = sentenceObj.dependencies.flatMap { deps => 

209 # // Note that 

210 # // Double.MaxValue.toInt == Int.MaxValue 

211 # // Double.PositiveInfinity.toInt == Int.MaxValue 

212 # DependencyUtils.distToRootOpt(tokenInterval, deps).map(_.toInt) 

213 # } 

214 

215 # /** returns the syntactic head of `mention` */ 

216 # def synHead: Option[Int] = synHeads.lastOption 

217 

218 # /** returns head token */ 

219 # def synHeadWord: Option[String] = synHead.map(i => sentenceObj.words(i)) 

220 

221 # /** returns head pos tag */ 

222 # def synHeadTag: Option[String] = synHead.flatMap(i => sentenceObj.tags.map(_(i))) 

223 

224 # /** returns head lemma */ 

225 # def synHeadLemma: Option[String] = synHead.flatMap(i => sentenceObj.lemmas.map(_(i))) 

226 

227 # /** returns all semantic heads */ 

228 # def semHeads: Seq[Int] = DependencyUtils.findHeadsStrict(tokenInterval, sentenceObj) 

229 

230 # /** returns the syntactic head of `mention` */ 

231 # def semHead: Option[Int] = semHeads.lastOption 

232 

233 # /** returns head token */ 

234 # def semHeadWord: Option[String] = semHead.map(i => sentenceObj.words(i)) 

235 

236 # /** returns head pos tag */ 

237 # def semHeadTag: Option[String] = semHead.flatMap(i => sentenceObj.tags.map(_(i))) 

238 

239 # /** returns head lemma */ 

240 # def semHeadLemma: Option[String] = semHead.flatMap(i => sentenceObj.lemmas.map(_(i))) 

241 

242 

243 # override def canEqual(a: Any) = a.isInstanceOf[Mention] 

244 

245 # override def equals(that: Any): Boolean = that match { 

246 # case that: Mention => that.canEqual(this) && this.hashCode == that.hashCode 

247 # case _ => false 

248 # } 

249 

250 # def compare(that: Mention): Int = { 

251 # require(this.document == that.document, 

252 # "can't compare mentions if they belong to different documents") 

253 # if (this.sentence < that.sentence) -1 

254 # else if (this.sentence > that.sentence) 1 

255 # else this.tokenInterval compare that.tokenInterval 

256 # } 

257 

258 # def precedes(that: Mention): Boolean = this.compare(that) < 0 

259 

260# class Mention(BaseModel): 

261 

262# TBM: typing.ClassVar[str] = "TextBoundMention" 

263# EM: typing.ClassVar[str] = "EventMention" 

264# RM: typing.ClassVar[str] = "RelationMention" 

265 

266# """ 

267# A labeled span of text. Used to model textual mentions of events, relations, and entities. 

268 

269# Parameters 

270# ---------- 

271# token_interval : Interval 

272# The span of the Mention represented as an Interval. 

273# sentence : int 

274# The sentence index that contains the Mention. 

275# document : Document 

276# The Document in which the Mention was found. 

277# foundBy : str 

278# The Odin IE rule that produced this Mention. 

279# label : str 

280# The label most closely associated with this span. Usually the lowest hyponym of "labels". 

281# labels: list 

282# The list of labels associated with this span. 

283# trigger: dict or None 

284# dict of JSON for Mention's trigger (event predicate or word(s) signaling the Mention). 

285# arguments: dict or None 

286# dict of JSON for Mention's arguments. 

287# paths: dict or None 

288# dict of JSON encoding the syntactic paths linking a Mention's arguments to its trigger (applies to Mentions produces from `type:"dependency"` rules). 

289# doc_id: str or None 

290# the id of the document 

291 

292# Attributes 

293# ---------- 

294# tokenInterval: processors.ds.Interval 

295# An `Interval` encoding the `start` and `end` of the `Mention`. 

296# start : int 

297# The token index that starts the `Mention`. 

298# end : int 

299# The token index that marks the end of the Mention (exclusive). 

300# sentenceObj : processors.ds.Sentence 

301# Pointer to the `Sentence` instance containing the `Mention`. 

302# characterStartOffset: int 

303# The index of the character that starts the `Mention`. 

304# characterEndOffset: int 

305# The index of the character that ends the `Mention`. 

306# type: Mention.TBM or Mention.EM or Mention.RM 

307# The type of the `Mention`. 

308 

309# See Also 

310# -------- 

311 

312# [`Odin` manual](https://arxiv.org/abs/1509.07513) 

313 

314# Methods 

315# ------- 

316# matches(label_pattern) 

317# Test if the provided pattern, `label_pattern`, matches any element in `Mention.labels`. 

318 

319# overlaps(other) 

320# Test whether other (token index or Mention) overlaps with span of this Mention. 

321 

322# copy(**kwargs) 

323# Copy constructor for this Mention. 

324 

325# words() 

326# Words for this Mention's span. 

327 

328# tags() 

329# Part of speech for this Mention's span. 

330 

331# lemmas() 

332# Lemmas for this Mention's span. 

333 

334# _chunks() 

335# chunk labels for this Mention's span. 

336 

337# _entities() 

338# NE labels for this Mention's span. 

339# """ 

340 

341 # def __init__(self, 

342 # token_interval, 

343 # sentence, 

344 # document, 

345 # foundBy, 

346 # label, 

347 # labels=None, 

348 # trigger=None, 

349 # arguments=None, 

350 # paths=None, 

351 # keep=True, 

352 # doc_id=None): 

353 

354 # NLPDatum.__init__(self) 

355 # self.label = label 

356 # self.labels = labels if labels else [self.label] 

357 # self.tokenInterval = token_interval 

358 # self.start = self.tokenInterval.start 

359 # self.end = self.tokenInterval.end 

360 # self.document = document 

361 # self._doc_id = doc_id or hash(self.document) 

362 # self.sentence = sentence 

363 # if trigger: 

364 # # NOTE: doc id is not stored for trigger's json, 

365 # # as it is assumed to be contained in the same document as its parent 

366 # trigger.update({"document": self._doc_id}) 

367 # self.trigger = Mention.load_from_JSON(trigger, self._to_document_map()) 

368 # else: 

369 # self.trigger = None 

370 # # unpack args 

371 # self.arguments = {role:[Mention.load_from_JSON(a, self._to_document_map()) for a in args] for (role, args) in arguments.items()} if arguments else None 

372 # self.paths = paths 

373 # self.keep = keep 

374 # self.foundBy = foundBy 

375 # # other 

376 # self.sentenceObj = self.document.sentences[self.sentence] 

377 # self.text = " ".join(self.sentenceObj.words[self.start:self.end]) 

378 # # recover offsets 

379 # self.characterStartOffset = self.sentenceObj.startOffsets[self.tokenInterval.start] 

380 # self.characterEndOffset = self.sentenceObj.endOffsets[self.tokenInterval.end - 1] 

381 # # for later recovery 

382 # self.id = None 

383 # self.type = self._set_type() 

384 

385 # def __str__(self): 

386 # return "{}: {}".format(OdinHighlighter.LABEL(self.label), OdinHighlighter.highlight_mention(self)) 

387 

388 # def __eq__(self, other): 

389 # if isinstance(other, self.__class__): 

390 # return self.__dict__ == other.__dict__ 

391 # else: 

392 # return False 

393 

394 # def __ne__(self, other): 

395 # return not self.__eq__(other) 

396 

397 # def __hash__(self): 

398 # return hash(self.to_JSON()) 

399 

400 # def startOffset(self): 

401 # return self.sentenceObj.endOffsets[self.start] 

402 

403 # def endOffset(self): 

404 # return self.sentenceObj.endOffsets[self.end -1] 

405 

406 # def words(self): 

407 # return self.sentenceObj.words[self.start:self.end] 

408 

409 # def tags(self): 

410 # return self.sentenceObj.tags[self.start:self.end] 

411 

412 # def lemmas(self): 

413 # return self.sentenceObj.lemmas[self.start:self.end] 

414 

415 # def _chunks(self): 

416 # return self.sentenceObj._chunks[self.start:self.end] 

417 

418 # def _entities(self): 

419 # return self.sentenceObj._entities[self.start:self.end] 

420 

421 # def overlaps(self, other): 

422 # """ 

423 # Checks for overlap. 

424 # """ 

425 # if isinstance(other, int): 

426 # return self.start <= other < self.end 

427 # elif isinstance(other, Mention): 

428 # # equiv. sentences + checks on start and end 

429 # return (self.sentence.__hash__() == other.sentence.__hash__()) and \ 

430 # self.tokenInterval.overlaps(other.tokenInterval) 

431 # else: 

432 # return False 

433 

434 

435 

436 # def _arguments_to_JSON_dict(self): 

437 # return dict((role, [a.to_JSON_dict() for a in args]) for (role, args) in self.arguments.items()) 

438 

439 # def _paths_to_JSON_dict(self): 

440 # return {role: paths.to_JSON_dict() for (role, paths) in self.paths} 

441 

442 # @staticmethod 

443 # def load_from_JSON(mjson, docs_dict): 

444 # # recover document 

445 # doc_id = mjson["document"] 

446 # doc = docs_dict[doc_id] 

447 # labels = mjson["labels"] 

448 # kwargs = { 

449 # "label": mjson.get("label", labels[0]), 

450 # "labels": labels, 

451 # "token_interval": Interval.load_from_JSON(mjson["tokenInterval"]), 

452 # "sentence": mjson["sentence"], 

453 # "document": doc, 

454 # "doc_id": doc_id, 

455 # "trigger": mjson.get("trigger", None), 

456 # "arguments": mjson.get("arguments", None), 

457 # "paths": mjson.get("paths", None), 

458 # "keep": mjson.get("keep", True), 

459 # "foundBy": mjson["foundBy"] 

460 # } 

461 # m = Mention(**kwargs) 

462 # # set IDs 

463 # m.id = mjson["id"] 

464 # m._doc_id = doc_id 

465 # # set character offsets 

466 # m.character_start_offset = mjson["characterStartOffset"] 

467 # m.character_end_offset = mjson["characterEndOffset"] 

468 # return m 

469 

470 # def _to_document_map(self): 

471 # return {self._doc_id: self.document} 

472 

473 # def _set_type(self): 

474 # # event mention 

475 # if self.trigger != None: 

476 # return Mention.EM 

477 # # textbound mention 

478 # elif self.trigger == None and self.arguments == None: 

479 # return Mention.TBM 

480 # else: 

481 # return Mention.RM 

482 

483 

class TextBoundMention(Mention):
    """A Mention that is nothing more than a labeled span of text.

    Unlike events and relations, a TextBoundMention never carries
    arguments or paths, so the inherited fields are pinned to None here.
    """

    # override from Mention
    arguments: typing.Optional[Mention.Arguments] = Field(
        default=None,
        description="A TextBoundMention has no arguments",
    )
    # override from Mention
    paths: typing.Optional[Mention.Paths] = Field(
        default=None,
        description="A TextBoundMention has no paths",
    )

490class RelationMention(Mention): 

491 # FIXME: ensure arguments dict is non-empt 

492 

493 # TODO: implement me 

494 # see https://github.com/clulab/processors/blob/9f89ea7bf6ac551f77dbfdbb8eec9bf216711df4/main/src/main/scala/org/clulab/odin/Mention.scala 

495 @property 

496 def is_valid(self) -> bool: 

497 """returns true if this is a valid mention""" 

498 # args should all be from same sentence 

499 raise NotImplementedError 

500 

501 # TODO: implement me 

502 @property 

503 def to_event_mention(trigger: TextBoundMention) -> "EventMention": 

504 """""" 

505 # check that trigger and self have same sent and doc 

506 raise NotImplementedError 

507 

508 # TODO: implement me 

509 def scatter(arg_name: str, size: int) -> list[RelationMention]: 

510 raise NotImplementedError 

511 # arguments 

512 # .getOrElse(argName, Nil) 

513 # .combinations(size) 

514 # .map(args => this + (argName -> args)) 

515 # .toList 

516 

517 # TODO: implement me 

518 # Create a new EventMention by removing a single argument 

519 def __sub__(other: typing.Any) -> RelationMention: 

520 raise NotImplementedError 

521 #copy(arguments = this.arguments - argName) 

522 # Create a new EventMention by removing a sequence of arguments 

523 # def --(argNames: Seq[String]): EventMention = 

524 # copy(arguments = this.arguments -- argNames) 

525 

526 # TODO: implement me 

527 def __add__(other: typing.Any) -> RelationMention: 

528 """Create a new RelationMention by adding a key, value pair to the arguments map""" 

529 #def +(arg: (String, Seq[Mention])): RelationMention = 

530 #copy(arguments = this.arguments + arg) 

531 raise NotImplementedError 

532 

533class EventMention(Mention): 

534 trigger: TextBoundMention = Field(description="") 

535 arguments: Mention.Arguments = Field(default={}, description="A mapping of the EventMention's arguments (role -> list[Mention])") 

536 paths: typing.Optional[Mention.Paths] = Field(default={}, description="Graph traversal leading to each argument") 

537 

538 def copy( 

539 self, 

540 maybe_trigger: typing.Optional[TextBoundMention] = None, 

541 maybe_labels: typing.Optional[list[str]] = None, 

542 maybe_token_interval: typing.Optional[Interval] = None, 

543 maybe_sentence_index: typing.Optional[int] = None, 

544 maybe_document: typing.Optional[Document] = None, 

545 maybe_keep: typing.Optional[bool] = None, 

546 maybe_arguments: typing.Optional[Mention.Arguments] = None, 

547 maybe_paths: typing.Optional[Mention.Paths] = None, 

548 maybe_found_by: typing.Optional[str] = None, 

549 ) -> EventMention: 

550 return EventMention( 

551 trigger = maybe_trigger or self.trigger, 

552 labels = maybe_labels or self.labels, 

553 token_interval = maybe_token_interval or self.token_interval, 

554 sentence_index = maybe_sentence_index or self.sentence_index, 

555 document = maybe_document or self.document, 

556 keep = maybe_keep or self.keep, 

557 arguments = maybe_arguments or self.arguments, 

558 paths = maybe_paths or self.paths, 

559 found_by = maybe_found_by or self.found_by 

560 ) 

561 

562 # TODO: implement me 

563 # see https://github.com/clulab/processors/blob/9f89ea7bf6ac551f77dbfdbb8eec9bf216711df4/main/src/main/scala/org/clulab/odin/Mention.scala#L323-L330 

564 @property 

565 def is_valid(self) -> bool: 

566 """returns true if this is a valid mention""" 

567 raise NotImplementedError 

568 

569 # TODO: implement me 

570 def to_relation_mention(self) -> RelationMention: 

571 raise NotImplementedError 

572 

573 # TODO: implement me 

574 def scatter(arg_name: str, size: int) -> list[EventMention]: 

575 raise NotImplementedError 

576 # arguments 

577 # .getOrElse(argName, Nil) 

578 # .combinations(size) 

579 # .map(args => this + (argName -> args)) 

580 # .toList 

581 

582 # TODO: implement me 

583 # Create a new EventMention by removing a single argument 

584 def __sub__(other: typing.Any) -> EventMention: 

585 raise NotImplementedError 

586 #copy(arguments = this.arguments - argName) 

587 # Create a new EventMention by removing a sequence of arguments 

588 # def --(argNames: Seq[String]): EventMention = 

589 # copy(arguments = this.arguments -- argNames) 

590 

591 # TODO: implement me 

592 def __add__(other: typing.Any) -> EventMention: 

593 """Create a new EventMention by adding a key, value pair to the arguments map""" 

594 #def +(arg: (String, Seq[Mention])): EventMention = 

595 #copy(arguments = this.arguments + arg) 

596 raise NotImplementedError 

597 

class CrossSentenceMention(Mention):
    """A Mention relating two Mentions that live in different sentences
    of the same document.
    """

    anchor: Mention = Field(description="The mention serving as the anchor for this cross-sentence mention")
    neighbor: Mention = Field(description="The second mention for this cross-sentence mention")

    # FIXME: add check on arguments
    # Scala reference enforces:
    #   require(arguments.size == 2, "CrossSentenceMention must have exactly two arguments")
    # and additionally:
    #   assert anchor.document == neighbor.document
    #   assert anchor.sentence_obj != neighbor.sentence_obj