Coverage for python/lum/clu/odin/mention.py: 63%

141 statements  

« prev     ^ index     » next       coverage.py v7.6.7, created at 2024-11-17 18:41 +0000

1# -*- coding: utf-8 -*- 

2from __future__ import annotations 

3from pydantic import BaseModel, Field 

4from lum.clu.processors.document import Document 

5from lum.clu.processors.sentence import Sentence 

6from lum.clu.processors.interval import Interval 

7from lum.clu.odin.synpath import SynPath 

8import re 

9import typing 

10 

11 

12__all__ = ["Mention", "TextBoundMention", "RelationMention", "EventMention", "CrossSentenceMention"] 

13 

14# MentionTypes = typing.Union[TextBoundMention, EventMention, RelationMention, CrossSentenceMention] 

15 

16class Mention(BaseModel): 

17 

18 Paths: typing.ClassVar[typing.TypeAlias] = dict[str, dict["Mention", SynPath]] 

19 

20 Arguments: typing.ClassVar[typing.TypeAlias] = dict[str, list["Mention"]] 

21 # FIXME: add validation that this is non-empty? 

22 labels: list[str] = Field(description="A sequence of labels for this mention. The first label in the sequence is considered the default.") 

23 # alias="tokenInterval",  

24 # TODO: consider adding https://docs.pydantic.dev/latest/api/config/#pydantic.config.ConfigDict.populate_by_name 

25 token_interval: Interval = Field(description="The interval of token indicess that form this mention.") 

26 # alias="sentence",  

27 sentence_index: int = Field(description="The index of the sentence where this mention occurs.") 

28 

29 document: Document = Field(description="The document where this mention occurs") 

30 

31 keep: bool = Field(default=True, description="Should we report this mention at the end?") 

32 

33 arguments: typing.Optional[Mention.Arguments] = Field(default=None, description="A map from argument name to a sequence of mentions. The value of the map is a sequence because there are events that can have several arguments with the same name. For example, in the biodomain, Binding may have several themes.") 

34 

35 paths: typing.Optional[Paths] = Field(default=None, description="Graph traversal leading to each argument") 

36 # alias="foundBy" 

37 found_by: str = Field(default="unknown", description="The name of the rule that produced this mention") 

38 

39 def copy( 

40 self, 

41 maybe_labels: typing.Optional[list[str]] = None, 

42 maybe_token_interval: typing.Optional[Interval] = None, 

43 maybe_sentence_index: typing.Optional[int] = None, 

44 maybe_document: typing.Optional[Document] = None, 

45 maybe_keep: typing.Optional[bool] = None, 

46 maybe_arguments: typing.Optional[Mention.Arguments] = None, 

47 maybe_paths: typing.Optional[Mention.Paths] = None, 

48 maybe_found_by: typing.Optional[str] = None, 

49 ) -> Mention: 

50 return Mention( 

51 labels = maybe_labels or self.labels, 

52 token_interval = maybe_token_interval or self.token_interval, 

53 sentence_index = maybe_sentence_index or self.sentence_index, 

54 document = maybe_document or self.document, 

55 keep = maybe_keep or self.keep, 

56 arguments = maybe_arguments or self.arguments, 

57 paths = maybe_paths or self.paths, 

58 found_by = maybe_found_by or self.found_by 

59 ) 

60 

61 @property 

62 def label(self) -> str: 

63 """the first label for the mention""" 

64 return self.labels[0] 

65 

66 @property 

67 def start(self) -> int: 

68 """index of the first token in the mention""" 

69 return self.token_interval.start 

70 

71 @property 

72 def end(self) -> int: 

73 """one after the last token in the mention""" 

74 return self.token_interval.end 

75 

76 @property 

77 def sentence_obj(self) -> Sentence: 

78 return self.document.sentences[self.sentence_index] 

79 

80 @property 

81 def sentenceObj(self) -> Sentence: 

82 self.sentence_obj 

83 

84 @property 

85 def start_offset(self) -> int: 

86 """character offset of the mention beginning""" 

87 return self.sentence_obj.start_offsets[self.start] 

88 

89 @property 

90 def startOffset(self) -> int: 

91 """character offset of the mention beginning""" 

92 return self.start_offset 

93 

94 @property 

95 def char_start_offset(self) -> int: 

96 """character offset of the mention beginning""" 

97 return self.start_offset 

98 

99 @property 

100 def end_offset(self) -> int: 

101 """character offset of the mention end""" 

102 return self.sentence_obj.end_offsets[self.end - 1] 

103 

104 @property 

105 def endOffset(self) -> int: 

106 """character offset of the mention end""" 

107 return self.end_offset 

108 

109 @property 

110 def char_end_offset(self) -> int: 

111 """character offset of the mention end""" 

112 return self.end_offset 

113 

114 @property 

115 def is_valid(self) -> bool: 

116 """returns true if this is a valid mention""" 

117 return True 

118 

119 @property 

120 def isValid(self) -> bool: 

121 """returns true if this is a valid mention""" 

122 return self.is_valid 

123 

124 def matches(self, label_or_pattern: typing.Union[str, re.Pattern]) -> bool: 

125 """returns true if `label_or_pattern` matches any of the mention labels""" 

126 if isinstance(label_or_pattern, str): 

127 return label_or_pattern in self.labels 

128 elif isinstance(label_or_pattern, re.Pattern): 

129 patt = label_or_pattern 

130 return True if any(re.match(patt, lbl) != None for lbl in self.labels) else False 

131 return False 

132 

133 @property 

134 def raw(self) -> list[str]: 

135 """returns all raw (original, no processing applied) tokens in mention""" 

136 return self.sentence_obj.raw[self.start:self.end] 

137 

138 @property 

139 def words(self) -> list[str]: 

140 """returns all tokens in mention""" 

141 return self.sentence_obj.words[self.start:self.end] 

142 

143 @property 

144 def tags(self) -> typing.Optional[list[str]]: 

145 """returns all tags in mention""" 

146 if self.sentence_obj.tags: 

147 return self.sentence_obj.tags[self.start:self.end] 

148 return None 

149 

150 @property 

151 def lemmas(self) -> typing.Optional[list[str]]: 

152 """returns all lemmas in mention""" 

153 if self.sentence_obj.lemmas: 

154 return self.sentence_obj.lemmas[self.start:self.end] 

155 return None 

156 

157 @property 

158 def entities(self) -> typing.Optional[list[str]]: 

159 """returns all entities in mention""" 

160 if self.sentence_obj.entities: 

161 return self.sentence_obj.entities[self.start:self.end] 

162 return None 

163 

164 @property 

165 def norms(self) -> typing.Optional[list[str]]: 

166 """returns all norms in mention""" 

167 if self.sentence_obj.norms: 

168 return self.sentence_obj.norms[self.start:self.end] 

169 return None 

170 

171 @property 

172 def chunks(self) -> typing.Optional[list[str]]: 

173 """returns all chunks in mention""" 

174 if self.sentence_obj.chunks: 

175 return self.sentence_obj.chunks[self.start:self.end] 

176 return None 

177 

178 @property 

179 def text(self) -> str: 

180 """returns a string that contains the mention""" 

181 _text = self.document.text 

182 if _text is not None: 

183 return _text[self.start_offset:self.end_offset] 

184 # FIXME: this can be improved 

185 else: 

186 return " ".join(self.raw[self.start:self.end]) 

187 

188 # /** returns a string that contains the mention */ 

189 # def text: String = document.text match { 

190 # case Some(txt) => txt.slice(startOffset, endOffset) 

191 # case None => 

192 # // try to reconstruct the sentence using the character offsets 

193 # val bits = raw.head +: tokenInterval.tail.map { i => 

194 # val spaces = " " * (sentenceObj.startOffsets(i) - sentenceObj.endOffsets(i - 1)) 

195 # val rawWord = sentenceObj.raw(i) 

196 # spaces + rawWord 

197 # } 

198 # bits.mkString 

199 # } 

200 

201 # /** returns all syntactic heads */ 

202 # def synHeads: Seq[Int] = sentenceObj.dependencies match { 

203 # case Some(deps) => DependencyUtils.findHeads(tokenInterval, deps) 

204 # case None => Nil 

205 # } 

206 

207 # /** returns the minimum distance to a root node for dependencies within the token interval */ 

208 # def distToRootOpt: Option[Int] = sentenceObj.dependencies.flatMap { deps => 

209 # // Note that 

210 # // Double.MaxValue.toInt == Int.MaxValue 

211 # // Double.PositiveInfinity.toInt == Int.MaxValue 

212 # DependencyUtils.distToRootOpt(tokenInterval, deps).map(_.toInt) 

213 # } 

214 

215 # /** returns the syntactic head of `mention` */ 

216 # def synHead: Option[Int] = synHeads.lastOption 

217 

218 # /** returns head token */ 

219 # def synHeadWord: Option[String] = synHead.map(i => sentenceObj.words(i)) 

220 

221 # /** returns head pos tag */ 

222 # def synHeadTag: Option[String] = synHead.flatMap(i => sentenceObj.tags.map(_(i))) 

223 

224 # /** returns head lemma */ 

225 # def synHeadLemma: Option[String] = synHead.flatMap(i => sentenceObj.lemmas.map(_(i))) 

226 

227 # /** returns all semantic heads */ 

228 # def semHeads: Seq[Int] = DependencyUtils.findHeadsStrict(tokenInterval, sentenceObj) 

229 

230 # /** returns the syntactic head of `mention` */ 

231 # def semHead: Option[Int] = semHeads.lastOption 

232 

233 # /** returns head token */ 

234 # def semHeadWord: Option[String] = semHead.map(i => sentenceObj.words(i)) 

235 

236 # /** returns head pos tag */ 

237 # def semHeadTag: Option[String] = semHead.flatMap(i => sentenceObj.tags.map(_(i))) 

238 

239 # /** returns head lemma */ 

240 # def semHeadLemma: Option[String] = semHead.flatMap(i => sentenceObj.lemmas.map(_(i))) 

241 

242 

243 # override def canEqual(a: Any) = a.isInstanceOf[Mention] 

244 

245 # override def equals(that: Any): Boolean = that match { 

246 # case that: Mention => that.canEqual(this) && this.hashCode == that.hashCode 

247 # case _ => false 

248 # } 

249 

250 # def compare(that: Mention): Int = { 

251 # require(this.document == that.document, 

252 # "can't compare mentions if they belong to different documents") 

253 # if (this.sentence < that.sentence) -1 

254 # else if (this.sentence > that.sentence) 1 

255 # else this.tokenInterval compare that.tokenInterval 

256 # } 

257 

258 # def precedes(that: Mention): Boolean = this.compare(that) < 0 

259 

260# class Mention(BaseModel): 

261 

262# TBM: typing.ClassVar[str] = "TextBoundMention" 

263# EM: typing.ClassVar[str] = "EventMention" 

264# RM: typing.ClassVar[str] = "RelationMention" 

265 

266# """ 

267# A labeled span of text. Used to model textual mentions of events, relations, and entities. 

268 

269# Parameters 

270# ---------- 

271# token_interval : Interval 

272# The span of the Mention represented as an Interval. 

273# sentence : int 

274# The sentence index that contains the Mention. 

275# document : Document 

276# The Document in which the Mention was found. 

277# foundBy : str 

278# The Odin IE rule that produced this Mention. 

279# label : str 

280# The label most closely associated with this span. Usually the lowest hyponym of "labels". 

281# labels: list 

282# The list of labels associated with this span. 

283# trigger: dict or None 

284# dict of JSON for Mention's trigger (event predicate or word(s) signaling the Mention). 

285# arguments: dict or None 

286# dict of JSON for Mention's arguments. 

287# paths: dict or None 

288# dict of JSON encoding the syntactic paths linking a Mention's arguments to its trigger (applies to Mentions produces from `type:"dependency"` rules). 

289# doc_id: str or None 

290# the id of the document 

291 

292# Attributes 

293# ---------- 

294# tokenInterval: processors.ds.Interval 

295# An `Interval` encoding the `start` and `end` of the `Mention`. 

296# start : int 

297# The token index that starts the `Mention`. 

298# end : int 

299# The token index that marks the end of the Mention (exclusive). 

300# sentenceObj : processors.ds.Sentence 

301# Pointer to the `Sentence` instance containing the `Mention`. 

302# characterStartOffset: int 

303# The index of the character that starts the `Mention`. 

304# characterEndOffset: int 

305# The index of the character that ends the `Mention`. 

306# type: Mention.TBM or Mention.EM or Mention.RM 

307# The type of the `Mention`. 

308 

309# See Also 

310# -------- 

311 

312# [`Odin` manual](https://arxiv.org/abs/1509.07513) 

313 

314# Methods 

315# ------- 

316# matches(label_pattern) 

317# Test if the provided pattern, `label_pattern`, matches any element in `Mention.labels`. 

318 

319# overlaps(other) 

320# Test whether other (token index or Mention) overlaps with span of this Mention. 

321 

322# copy(**kwargs) 

323# Copy constructor for this Mention. 

324 

325# words() 

326# Words for this Mention's span. 

327 

328# tags() 

329# Part of speech for this Mention's span. 

330 

331# lemmas() 

332# Lemmas for this Mention's span. 

333 

334# _chunks() 

335# chunk labels for this Mention's span. 

336 

337# _entities() 

338# NE labels for this Mention's span. 

339# """ 

340 

341 # def __init__(self, 

342 # token_interval, 

343 # sentence, 

344 # document, 

345 # foundBy, 

346 # label, 

347 # labels=None, 

348 # trigger=None, 

349 # arguments=None, 

350 # paths=None, 

351 # keep=True, 

352 # doc_id=None): 

353 

354 # NLPDatum.__init__(self) 

355 # self.label = label 

356 # self.labels = labels if labels else [self.label] 

357 # self.tokenInterval = token_interval 

358 # self.start = self.tokenInterval.start 

359 # self.end = self.tokenInterval.end 

360 # self.document = document 

361 # self._doc_id = doc_id or hash(self.document) 

362 # self.sentence = sentence 

363 # if trigger: 

364 # # NOTE: doc id is not stored for trigger's json, 

365 # # as it is assumed to be contained in the same document as its parent 

366 # trigger.update({"document": self._doc_id}) 

367 # self.trigger = Mention.load_from_JSON(trigger, self._to_document_map()) 

368 # else: 

369 # self.trigger = None 

370 # # unpack args 

371 # self.arguments = {role:[Mention.load_from_JSON(a, self._to_document_map()) for a in args] for (role, args) in arguments.items()} if arguments else None 

372 # self.paths = paths 

373 # self.keep = keep 

374 # self.foundBy = foundBy 

375 # # other 

376 # self.sentenceObj = self.document.sentences[self.sentence] 

377 # self.text = " ".join(self.sentenceObj.words[self.start:self.end]) 

378 # # recover offsets 

379 # self.characterStartOffset = self.sentenceObj.startOffsets[self.tokenInterval.start] 

380 # self.characterEndOffset = self.sentenceObj.endOffsets[self.tokenInterval.end - 1] 

381 # # for later recovery 

382 # self.id = None 

383 # self.type = self._set_type() 

384 

385 # def __str__(self): 

386 # return "{}: {}".format(OdinHighlighter.LABEL(self.label), OdinHighlighter.highlight_mention(self)) 

387 

388 # def __eq__(self, other): 

389 # if isinstance(other, self.__class__): 

390 # return self.__dict__ == other.__dict__ 

391 # else: 

392 # return False 

393 

394 # def __ne__(self, other): 

395 # return not self.__eq__(other) 

396 

397 # def __hash__(self): 

398 # return hash(self.to_JSON()) 

399 

400 # def startOffset(self): 

401 # return self.sentenceObj.endOffsets[self.start] 

402 

403 # def endOffset(self): 

404 # return self.sentenceObj.endOffsets[self.end -1] 

405 

406 # def words(self): 

407 # return self.sentenceObj.words[self.start:self.end] 

408 

409 # def tags(self): 

410 # return self.sentenceObj.tags[self.start:self.end] 

411 

412 # def lemmas(self): 

413 # return self.sentenceObj.lemmas[self.start:self.end] 

414 

415 # def _chunks(self): 

416 # return self.sentenceObj._chunks[self.start:self.end] 

417 

418 # def _entities(self): 

419 # return self.sentenceObj._entities[self.start:self.end] 

420 

421 # def overlaps(self, other): 

422 # """ 

423 # Checks for overlap. 

424 # """ 

425 # if isinstance(other, int): 

426 # return self.start <= other < self.end 

427 # elif isinstance(other, Mention): 

428 # # equiv. sentences + checks on start and end 

429 # return (self.sentence.__hash__() == other.sentence.__hash__()) and \ 

430 # self.tokenInterval.overlaps(other.tokenInterval) 

431 # else: 

432 # return False 

433 

434 

435 

436 # def _arguments_to_JSON_dict(self): 

437 # return dict((role, [a.to_JSON_dict() for a in args]) for (role, args) in self.arguments.items()) 

438 

439 # def _paths_to_JSON_dict(self): 

440 # return {role: paths.to_JSON_dict() for (role, paths) in self.paths} 

441 

442 # @staticmethod 

443 # def load_from_JSON(mjson, docs_dict): 

444 # # recover document 

445 # doc_id = mjson["document"] 

446 # doc = docs_dict[doc_id] 

447 # labels = mjson["labels"] 

448 # kwargs = { 

449 # "label": mjson.get("label", labels[0]), 

450 # "labels": labels, 

451 # "token_interval": Interval.load_from_JSON(mjson["tokenInterval"]), 

452 # "sentence": mjson["sentence"], 

453 # "document": doc, 

454 # "doc_id": doc_id, 

455 # "trigger": mjson.get("trigger", None), 

456 # "arguments": mjson.get("arguments", None), 

457 # "paths": mjson.get("paths", None), 

458 # "keep": mjson.get("keep", True), 

459 # "foundBy": mjson["foundBy"] 

460 # } 

461 # m = Mention(**kwargs) 

462 # # set IDs 

463 # m.id = mjson["id"] 

464 # m._doc_id = doc_id 

465 # # set character offsets 

466 # m.character_start_offset = mjson["characterStartOffset"] 

467 # m.character_end_offset = mjson["characterEndOffset"] 

468 # return m 

469 

470 # def _to_document_map(self): 

471 # return {self._doc_id: self.document} 

472 

473 # def _set_type(self): 

474 # # event mention 

475 # if self.trigger != None: 

476 # return Mention.EM 

477 # # textbound mention 

478 # elif self.trigger == None and self.arguments == None: 

479 # return Mention.TBM 

480 # else: 

481 # return Mention.RM 

482 

483 

class TextBoundMention(Mention):
    """A Mention that is nothing more than a labeled span of text.

    Unlike events and relations, a TextBoundMention never carries
    arguments or paths, so the inherited fields are pinned to None here.
    """

    # override from Mention
    arguments: typing.Optional[Mention.Arguments] = Field(
        default=None,
        description="A TextBoundMention has no arguments",
    )
    # override from Mention
    paths: typing.Optional[Mention.Paths] = Field(
        default=None,
        description="A TextBoundMention has no paths",
    )

490class RelationMention(Mention): 

491 # FIXME: ensure arguments dict is non-empt 

492 

493 # TODO: implement me 

494 # see https://github.com/clulab/processors/blob/9f89ea7bf6ac551f77dbfdbb8eec9bf216711df4/main/src/main/scala/org/clulab/odin/Mention.scala 

495 @property 

496 def is_valid(self) -> bool: 

497 """returns true if this is a valid mention""" 

498 # args should all be from same sentence 

499 raise NotImplementedError 

500 

501 # TODO: implement me 

502 @property 

503 def to_event_mention(trigger: TextBoundMention) -> "EventMention": 

504 """""" 

505 # check that trigger and self have same sent and doc 

506 raise NotImplementedError 

507 

508 # TODO: implement me 

509 def scatter(arg_name: str, size: int) -> list[RelationMention]: 

510 raise NotImplementedError 

511 # arguments 

512 # .getOrElse(argName, Nil) 

513 # .combinations(size) 

514 # .map(args => this + (argName -> args)) 

515 # .toList 

516 

517 # TODO: implement me 

518 # Create a new EventMention by removing a single argument 

519 def __sub__(other: typing.Any) -> RelationMention: 

520 raise NotImplementedError 

521 #copy(arguments = this.arguments - argName) 

522 # Create a new EventMention by removing a sequence of arguments 

523 # def --(argNames: Seq[String]): EventMention = 

524 # copy(arguments = this.arguments -- argNames) 

525 

526 # TODO: implement me 

527 def __add__(other: typing.Any) -> RelationMention: 

528 """Create a new RelationMention by adding a key, value pair to the arguments map""" 

529 #def +(arg: (String, Seq[Mention])): RelationMention = 

530 #copy(arguments = this.arguments + arg) 

531 raise NotImplementedError 

532 

533class EventMention(Mention): 

534 trigger: TextBoundMention = Field(description="") 

535 arguments: Mention.Arguments = Field(default={}, description="A mapping of the EventMention's arguments (role -> list[Mention])") 

536 paths: typing.Optional[Mention.Paths] = Field(default={}, description="Graph traversal leading to each argument") 

537 

538 def copy( 

539 self, 

540 maybe_trigger: typing.Optional[TextBoundMention] = None, 

541 maybe_labels: typing.Optional[list[str]] = None, 

542 maybe_token_interval: typing.Optional[Interval] = None, 

543 maybe_sentence_index: typing.Optional[int] = None, 

544 maybe_document: typing.Optional[Document] = None, 

545 maybe_keep: typing.Optional[bool] = None, 

546 maybe_arguments: typing.Optional[Mention.Arguments] = None, 

547 maybe_paths: typing.Optional[Mention.Paths] = None, 

548 maybe_found_by: typing.Optional[str] = None, 

549 ) -> EventMention: 

550 return EventMention( 

551 trigger = maybe_trigger or self.trigger, 

552 labels = maybe_labels or self.labels, 

553 token_interval = maybe_token_interval or self.token_interval, 

554 sentence_index = maybe_sentence_index or self.sentence_index, 

555 document = maybe_document or self.document, 

556 keep = maybe_keep or self.keep, 

557 arguments = maybe_arguments or self.arguments, 

558 paths = maybe_paths or self.paths, 

559 found_by = maybe_found_by or self.found_by 

560 ) 

561 

562 # TODO: implement me 

563 # see https://github.com/clulab/processors/blob/9f89ea7bf6ac551f77dbfdbb8eec9bf216711df4/main/src/main/scala/org/clulab/odin/Mention.scala#L323-L330 

564 @property 

565 def is_valid(self) -> bool: 

566 """returns true if this is a valid mention""" 

567 raise NotImplementedError 

568 

569 # TODO: implement me 

570 def to_relation_mention(self) -> RelationMention: 

571 raise NotImplementedError 

572 

573 # TODO: implement me 

574 def scatter(arg_name: str, size: int) -> list[EventMention]: 

575 raise NotImplementedError 

576 # arguments 

577 # .getOrElse(argName, Nil) 

578 # .combinations(size) 

579 # .map(args => this + (argName -> args)) 

580 # .toList 

581 

582 # TODO: implement me 

583 # Create a new EventMention by removing a single argument 

584 def __sub__(other: typing.Any) -> EventMention: 

585 raise NotImplementedError 

586 #copy(arguments = this.arguments - argName) 

587 # Create a new EventMention by removing a sequence of arguments 

588 # def --(argNames: Seq[String]): EventMention = 

589 # copy(arguments = this.arguments -- argNames) 

590 

591 # TODO: implement me 

592 def __add__(other: typing.Any) -> EventMention: 

593 """Create a new EventMention by adding a key, value pair to the arguments map""" 

594 #def +(arg: (String, Seq[Mention])): EventMention = 

595 #copy(arguments = this.arguments + arg) 

596 raise NotImplementedError 

597 

class CrossSentenceMention(Mention):
    """A Mention relating two Mentions that live in different sentences
    of the same document.
    """

    anchor: Mention = Field(description="The mention serving as the anchor for this cross-sentence mention")
    neighbor: Mention = Field(description="The second mention for this cross-sentence mention")

    # FIXME: add check on arguments
    # Scala reference enforces:
    #   require(arguments.size == 2, "CrossSentenceMention must have exactly two arguments")
    # and additionally:
    #   assert anchor.document == neighbor.document
    #   assert anchor.sentence_obj != neighbor.sentence_obj