Coverage for python/lum/clu/odin/mention.py: 63%
141 statements
« prev ^ index » next coverage.py v7.6.7, created at 2024-11-17 18:41 +0000
1# -*- coding: utf-8 -*-
2from __future__ import annotations
3from pydantic import BaseModel, Field
4from lum.clu.processors.document import Document
5from lum.clu.processors.sentence import Sentence
6from lum.clu.processors.interval import Interval
7from lum.clu.odin.synpath import SynPath
8import re
9import typing
12__all__ = ["Mention", "TextBoundMention", "RelationMention", "EventMention", "CrossSentenceMention"]
14# MentionTypes = typing.Union[TextBoundMention, EventMention, RelationMention, CrossSentenceMention]
class Mention(BaseModel):
    """A labeled span of text within one sentence of a `Document`.

    Used to model textual mentions of entities, relations, and events.
    See the Odin manual (https://arxiv.org/abs/1509.07513) for details.
    """

    # Type alias: syntactic paths leading to each argument (role -> (mention -> path)).
    Paths: typing.ClassVar[typing.TypeAlias] = dict[str, dict["Mention", SynPath]]

    # Type alias: mapping from role name to the mentions filling that role.
    Arguments: typing.ClassVar[typing.TypeAlias] = dict[str, list["Mention"]]

    # FIXME: add validation that this is non-empty?
    labels: list[str] = Field(description="A sequence of labels for this mention. The first label in the sequence is considered the default.")

    # alias="tokenInterval",
    # TODO: consider adding https://docs.pydantic.dev/latest/api/config/#pydantic.config.ConfigDict.populate_by_name
    token_interval: Interval = Field(description="The interval of token indices that form this mention.")

    # alias="sentence",
    sentence_index: int = Field(description="The index of the sentence where this mention occurs.")

    document: Document = Field(description="The document where this mention occurs")

    keep: bool = Field(default=True, description="Should we report this mention at the end?")

    arguments: typing.Optional[Mention.Arguments] = Field(default=None, description="A map from argument name to a sequence of mentions. The value of the map is a sequence because there are events that can have several arguments with the same name. For example, in the biodomain, Binding may have several themes.")

    paths: typing.Optional[Paths] = Field(default=None, description="Graph traversal leading to each argument")

    # alias="foundBy"
    found_by: str = Field(default="unknown", description="The name of the rule that produced this mention")

    def copy(
        self,
        maybe_labels: typing.Optional[list[str]] = None,
        maybe_token_interval: typing.Optional[Interval] = None,
        maybe_sentence_index: typing.Optional[int] = None,
        maybe_document: typing.Optional[Document] = None,
        maybe_keep: typing.Optional[bool] = None,
        maybe_arguments: typing.Optional[Mention.Arguments] = None,
        maybe_paths: typing.Optional[Mention.Paths] = None,
        maybe_found_by: typing.Optional[str] = None,
    ) -> Mention:
        """Copy constructor.  Each `maybe_*` argument, when not None, overrides
        the corresponding attribute of this mention in the copy.

        NOTE: explicit `is None` checks are used (rather than `or`) so that
        falsy overrides such as `maybe_keep=False`, `maybe_sentence_index=0`,
        or an empty arguments dict are honored instead of being silently
        replaced by the current values.
        """
        return Mention(
            labels = self.labels if maybe_labels is None else maybe_labels,
            token_interval = self.token_interval if maybe_token_interval is None else maybe_token_interval,
            sentence_index = self.sentence_index if maybe_sentence_index is None else maybe_sentence_index,
            document = self.document if maybe_document is None else maybe_document,
            keep = self.keep if maybe_keep is None else maybe_keep,
            arguments = self.arguments if maybe_arguments is None else maybe_arguments,
            paths = self.paths if maybe_paths is None else maybe_paths,
            found_by = self.found_by if maybe_found_by is None else maybe_found_by
        )

    @property
    def label(self) -> str:
        """the first label for the mention"""
        return self.labels[0]

    @property
    def start(self) -> int:
        """index of the first token in the mention"""
        return self.token_interval.start

    @property
    def end(self) -> int:
        """one after the last token in the mention"""
        return self.token_interval.end

    @property
    def sentence_obj(self) -> Sentence:
        """the `Sentence` containing this mention"""
        return self.document.sentences[self.sentence_index]

    @property
    def sentenceObj(self) -> Sentence:
        """camelCase alias for `sentence_obj`"""
        # FIX: previously evaluated the property without returning it (always returned None)
        return self.sentence_obj

    @property
    def start_offset(self) -> int:
        """character offset of the mention beginning"""
        return self.sentence_obj.start_offsets[self.start]

    @property
    def startOffset(self) -> int:
        """character offset of the mention beginning"""
        return self.start_offset

    @property
    def char_start_offset(self) -> int:
        """character offset of the mention beginning"""
        return self.start_offset

    @property
    def end_offset(self) -> int:
        """character offset of the mention end"""
        return self.sentence_obj.end_offsets[self.end - 1]

    @property
    def endOffset(self) -> int:
        """character offset of the mention end"""
        return self.end_offset

    @property
    def char_end_offset(self) -> int:
        """character offset of the mention end"""
        return self.end_offset

    @property
    def is_valid(self) -> bool:
        """returns true if this is a valid mention"""
        return True

    @property
    def isValid(self) -> bool:
        """returns true if this is a valid mention"""
        return self.is_valid

    def matches(self, label_or_pattern: typing.Union[str, re.Pattern]) -> bool:
        """returns true if `label_or_pattern` matches any of the mention labels"""
        if isinstance(label_or_pattern, str):
            return label_or_pattern in self.labels
        elif isinstance(label_or_pattern, re.Pattern):
            # the pattern is already compiled; match against each label
            return any(label_or_pattern.match(lbl) is not None for lbl in self.labels)
        return False

    @property
    def raw(self) -> list[str]:
        """returns all raw (original, no processing applied) tokens in mention"""
        return self.sentence_obj.raw[self.start:self.end]

    @property
    def words(self) -> list[str]:
        """returns all tokens in mention"""
        return self.sentence_obj.words[self.start:self.end]

    @property
    def tags(self) -> typing.Optional[list[str]]:
        """returns all tags in mention"""
        if self.sentence_obj.tags:
            return self.sentence_obj.tags[self.start:self.end]
        return None

    @property
    def lemmas(self) -> typing.Optional[list[str]]:
        """returns all lemmas in mention"""
        if self.sentence_obj.lemmas:
            return self.sentence_obj.lemmas[self.start:self.end]
        return None

    @property
    def entities(self) -> typing.Optional[list[str]]:
        """returns all entities in mention"""
        if self.sentence_obj.entities:
            return self.sentence_obj.entities[self.start:self.end]
        return None

    @property
    def norms(self) -> typing.Optional[list[str]]:
        """returns all norms in mention"""
        if self.sentence_obj.norms:
            return self.sentence_obj.norms[self.start:self.end]
        return None

    @property
    def chunks(self) -> typing.Optional[list[str]]:
        """returns all chunks in mention"""
        if self.sentence_obj.chunks:
            return self.sentence_obj.chunks[self.start:self.end]
        return None

    @property
    def text(self) -> str:
        """returns a string that contains the mention"""
        _text = self.document.text
        if _text is not None:
            return _text[self.start_offset:self.end_offset]
        # FIXME: this can be improved (reconstruct spacing using character offsets;
        # see the Scala reference implementation below)
        else:
            # FIX: `self.raw` is already sliced to the mention's span; slicing it
            # again with absolute token indices previously produced wrong output
            # for any mention not starting at token 0.
            return " ".join(self.raw)

    # /** returns a string that contains the mention */
    # def text: String = document.text match {
    #   case Some(txt) => txt.slice(startOffset, endOffset)
    #   case None =>
    #     // try to reconstruct the sentence using the character offsets
    #     val bits = raw.head +: tokenInterval.tail.map { i =>
    #       val spaces = " " * (sentenceObj.startOffsets(i) - sentenceObj.endOffsets(i - 1))
    #       val rawWord = sentenceObj.raw(i)
    #       spaces + rawWord
    #     }
    #     bits.mkString
    # }

    # /** returns all syntactic heads */
    # def synHeads: Seq[Int] = sentenceObj.dependencies match {
    #   case Some(deps) => DependencyUtils.findHeads(tokenInterval, deps)
    #   case None => Nil
    # }

    # /** returns the minimum distance to a root node for dependencies within the token interval */
    # def distToRootOpt: Option[Int] = sentenceObj.dependencies.flatMap { deps =>
    #   // Note that
    #   // Double.MaxValue.toInt == Int.MaxValue
    #   // Double.PositiveInfinity.toInt == Int.MaxValue
    #   DependencyUtils.distToRootOpt(tokenInterval, deps).map(_.toInt)
    # }

    # /** returns the syntactic head of `mention` */
    # def synHead: Option[Int] = synHeads.lastOption

    # /** returns head token */
    # def synHeadWord: Option[String] = synHead.map(i => sentenceObj.words(i))

    # /** returns head pos tag */
    # def synHeadTag: Option[String] = synHead.flatMap(i => sentenceObj.tags.map(_(i)))

    # /** returns head lemma */
    # def synHeadLemma: Option[String] = synHead.flatMap(i => sentenceObj.lemmas.map(_(i)))

    # /** returns all semantic heads */
    # def semHeads: Seq[Int] = DependencyUtils.findHeadsStrict(tokenInterval, sentenceObj)

    # /** returns the syntactic head of `mention` */
    # def semHead: Option[Int] = semHeads.lastOption

    # /** returns head token */
    # def semHeadWord: Option[String] = semHead.map(i => sentenceObj.words(i))

    # /** returns head pos tag */
    # def semHeadTag: Option[String] = semHead.flatMap(i => sentenceObj.tags.map(_(i)))

    # /** returns head lemma */
    # def semHeadLemma: Option[String] = semHead.flatMap(i => sentenceObj.lemmas.map(_(i)))

    # override def canEqual(a: Any) = a.isInstanceOf[Mention]

    # override def equals(that: Any): Boolean = that match {
    #   case that: Mention => that.canEqual(this) && this.hashCode == that.hashCode
    #   case _ => false
    # }

    # def compare(that: Mention): Int = {
    #   require(this.document == that.document,
    #           "can't compare mentions if they belong to different documents")
    #   if (this.sentence < that.sentence) -1
    #   else if (this.sentence > that.sentence) 1
    #   else this.tokenInterval compare that.tokenInterval
    # }

    # def precedes(that: Mention): Boolean = this.compare(that) < 0
260# class Mention(BaseModel):
262# TBM: typing.ClassVar[str] = "TextBoundMention"
263# EM: typing.ClassVar[str] = "EventMention"
264# RM: typing.ClassVar[str] = "RelationMention"
266# """
267# A labeled span of text. Used to model textual mentions of events, relations, and entities.
269# Parameters
270# ----------
271# token_interval : Interval
272# The span of the Mention represented as an Interval.
273# sentence : int
274# The sentence index that contains the Mention.
275# document : Document
276# The Document in which the Mention was found.
277# foundBy : str
278# The Odin IE rule that produced this Mention.
279# label : str
280# The label most closely associated with this span. Usually the lowest hyponym of "labels".
281# labels: list
282# The list of labels associated with this span.
283# trigger: dict or None
284# dict of JSON for Mention's trigger (event predicate or word(s) signaling the Mention).
285# arguments: dict or None
286# dict of JSON for Mention's arguments.
287# paths: dict or None
288# dict of JSON encoding the syntactic paths linking a Mention's arguments to its trigger (applies to Mentions produces from `type:"dependency"` rules).
289# doc_id: str or None
290# the id of the document
292# Attributes
293# ----------
294# tokenInterval: processors.ds.Interval
295# An `Interval` encoding the `start` and `end` of the `Mention`.
296# start : int
297# The token index that starts the `Mention`.
298# end : int
299# The token index that marks the end of the Mention (exclusive).
300# sentenceObj : processors.ds.Sentence
301# Pointer to the `Sentence` instance containing the `Mention`.
302# characterStartOffset: int
303# The index of the character that starts the `Mention`.
304# characterEndOffset: int
305# The index of the character that ends the `Mention`.
306# type: Mention.TBM or Mention.EM or Mention.RM
307# The type of the `Mention`.
309# See Also
310# --------
312# [`Odin` manual](https://arxiv.org/abs/1509.07513)
314# Methods
315# -------
316# matches(label_pattern)
317# Test if the provided pattern, `label_pattern`, matches any element in `Mention.labels`.
319# overlaps(other)
320# Test whether other (token index or Mention) overlaps with span of this Mention.
322# copy(**kwargs)
323# Copy constructor for this Mention.
325# words()
326# Words for this Mention's span.
328# tags()
329# Part of speech for this Mention's span.
331# lemmas()
332# Lemmas for this Mention's span.
334# _chunks()
335# chunk labels for this Mention's span.
337# _entities()
338# NE labels for this Mention's span.
339# """
341 # def __init__(self,
342 # token_interval,
343 # sentence,
344 # document,
345 # foundBy,
346 # label,
347 # labels=None,
348 # trigger=None,
349 # arguments=None,
350 # paths=None,
351 # keep=True,
352 # doc_id=None):
354 # NLPDatum.__init__(self)
355 # self.label = label
356 # self.labels = labels if labels else [self.label]
357 # self.tokenInterval = token_interval
358 # self.start = self.tokenInterval.start
359 # self.end = self.tokenInterval.end
360 # self.document = document
361 # self._doc_id = doc_id or hash(self.document)
362 # self.sentence = sentence
363 # if trigger:
364 # # NOTE: doc id is not stored for trigger's json,
365 # # as it is assumed to be contained in the same document as its parent
366 # trigger.update({"document": self._doc_id})
367 # self.trigger = Mention.load_from_JSON(trigger, self._to_document_map())
368 # else:
369 # self.trigger = None
370 # # unpack args
371 # self.arguments = {role:[Mention.load_from_JSON(a, self._to_document_map()) for a in args] for (role, args) in arguments.items()} if arguments else None
372 # self.paths = paths
373 # self.keep = keep
374 # self.foundBy = foundBy
375 # # other
376 # self.sentenceObj = self.document.sentences[self.sentence]
377 # self.text = " ".join(self.sentenceObj.words[self.start:self.end])
378 # # recover offsets
379 # self.characterStartOffset = self.sentenceObj.startOffsets[self.tokenInterval.start]
380 # self.characterEndOffset = self.sentenceObj.endOffsets[self.tokenInterval.end - 1]
381 # # for later recovery
382 # self.id = None
383 # self.type = self._set_type()
385 # def __str__(self):
386 # return "{}: {}".format(OdinHighlighter.LABEL(self.label), OdinHighlighter.highlight_mention(self))
388 # def __eq__(self, other):
389 # if isinstance(other, self.__class__):
390 # return self.__dict__ == other.__dict__
391 # else:
392 # return False
394 # def __ne__(self, other):
395 # return not self.__eq__(other)
397 # def __hash__(self):
398 # return hash(self.to_JSON())
400 # def startOffset(self):
401 # return self.sentenceObj.endOffsets[self.start]
403 # def endOffset(self):
404 # return self.sentenceObj.endOffsets[self.end -1]
406 # def words(self):
407 # return self.sentenceObj.words[self.start:self.end]
409 # def tags(self):
410 # return self.sentenceObj.tags[self.start:self.end]
412 # def lemmas(self):
413 # return self.sentenceObj.lemmas[self.start:self.end]
415 # def _chunks(self):
416 # return self.sentenceObj._chunks[self.start:self.end]
418 # def _entities(self):
419 # return self.sentenceObj._entities[self.start:self.end]
421 # def overlaps(self, other):
422 # """
423 # Checks for overlap.
424 # """
425 # if isinstance(other, int):
426 # return self.start <= other < self.end
427 # elif isinstance(other, Mention):
428 # # equiv. sentences + checks on start and end
429 # return (self.sentence.__hash__() == other.sentence.__hash__()) and \
430 # self.tokenInterval.overlaps(other.tokenInterval)
431 # else:
432 # return False
436 # def _arguments_to_JSON_dict(self):
437 # return dict((role, [a.to_JSON_dict() for a in args]) for (role, args) in self.arguments.items())
439 # def _paths_to_JSON_dict(self):
440 # return {role: paths.to_JSON_dict() for (role, paths) in self.paths}
442 # @staticmethod
443 # def load_from_JSON(mjson, docs_dict):
444 # # recover document
445 # doc_id = mjson["document"]
446 # doc = docs_dict[doc_id]
447 # labels = mjson["labels"]
448 # kwargs = {
449 # "label": mjson.get("label", labels[0]),
450 # "labels": labels,
451 # "token_interval": Interval.load_from_JSON(mjson["tokenInterval"]),
452 # "sentence": mjson["sentence"],
453 # "document": doc,
454 # "doc_id": doc_id,
455 # "trigger": mjson.get("trigger", None),
456 # "arguments": mjson.get("arguments", None),
457 # "paths": mjson.get("paths", None),
458 # "keep": mjson.get("keep", True),
459 # "foundBy": mjson["foundBy"]
460 # }
461 # m = Mention(**kwargs)
462 # # set IDs
463 # m.id = mjson["id"]
464 # m._doc_id = doc_id
465 # # set character offsets
466 # m.character_start_offset = mjson["characterStartOffset"]
467 # m.character_end_offset = mjson["characterEndOffset"]
468 # return m
470 # def _to_document_map(self):
471 # return {self._doc_id: self.document}
473 # def _set_type(self):
474 # # event mention
475 # if self.trigger != None:
476 # return Mention.EM
477 # # textbound mention
478 # elif self.trigger == None and self.arguments == None:
479 # return Mention.TBM
480 # else:
481 # return Mention.RM
class TextBoundMention(Mention):
    # A mention that is nothing more than a labeled span of text:
    # it carries no arguments and no syntactic paths.

    # override from Mention: pin both to None for this subtype
    arguments: typing.Optional[Mention.Arguments] = Field(default=None, description="A TextBoundMention has no arguments")
    paths: typing.Optional[Mention.Paths] = Field(default=None, description="A TextBoundMention has no paths")
class RelationMention(Mention):
    """A mention of a relation between arguments, with no trigger.

    Mirrors the Scala implementation in org.clulab.odin.Mention; the method
    bodies below are not yet ported (they raise NotImplementedError).
    """
    # FIXME: ensure arguments dict is non-empty

    # TODO: implement me
    # see https://github.com/clulab/processors/blob/9f89ea7bf6ac551f77dbfdbb8eec9bf216711df4/main/src/main/scala/org/clulab/odin/Mention.scala
    @property
    def is_valid(self) -> bool:
        """returns true if this is a valid mention"""
        # args should all be from same sentence
        raise NotImplementedError

    # TODO: implement me
    # FIX: was erroneously decorated with @property while taking an argument,
    # and was missing `self` as the first parameter.
    def to_event_mention(self, trigger: TextBoundMention) -> "EventMention":
        """Promote this RelationMention to an EventMention anchored at `trigger`."""
        # check that trigger and self have same sent and doc
        raise NotImplementedError

    # TODO: implement me
    # FIX: was missing `self` as the first parameter.
    def scatter(self, arg_name: str, size: int) -> list[RelationMention]:
        raise NotImplementedError
        # arguments
        #   .getOrElse(argName, Nil)
        #   .combinations(size)
        #   .map(args => this + (argName -> args))
        #   .toList

    # TODO: implement me
    # Create a new RelationMention by removing a single argument
    # FIX: was missing `self` as the first parameter.
    def __sub__(self, other: typing.Any) -> RelationMention:
        raise NotImplementedError
        #copy(arguments = this.arguments - argName)
        # Create a new RelationMention by removing a sequence of arguments
        # def --(argNames: Seq[String]): RelationMention =
        #   copy(arguments = this.arguments -- argNames)

    # TODO: implement me
    # FIX: was missing `self` as the first parameter.
    def __add__(self, other: typing.Any) -> RelationMention:
        """Create a new RelationMention by adding a key, value pair to the arguments map"""
        #def +(arg: (String, Seq[Mention])): RelationMention =
        #copy(arguments = this.arguments + arg)
        raise NotImplementedError
class EventMention(Mention):
    """A mention of an event: a trigger plus a mapping of arguments."""

    trigger: TextBoundMention = Field(description="The TextBoundMention (event predicate or signal word(s)) that triggers this event")
    arguments: Mention.Arguments = Field(default={}, description="A mapping of the EventMention's arguments (role -> list[Mention])")
    paths: typing.Optional[Mention.Paths] = Field(default={}, description="Graph traversal leading to each argument")

    def copy(
        self,
        maybe_trigger: typing.Optional[TextBoundMention] = None,
        maybe_labels: typing.Optional[list[str]] = None,
        maybe_token_interval: typing.Optional[Interval] = None,
        maybe_sentence_index: typing.Optional[int] = None,
        maybe_document: typing.Optional[Document] = None,
        maybe_keep: typing.Optional[bool] = None,
        maybe_arguments: typing.Optional[Mention.Arguments] = None,
        maybe_paths: typing.Optional[Mention.Paths] = None,
        maybe_found_by: typing.Optional[str] = None,
    ) -> EventMention:
        """Copy constructor.  Each `maybe_*` argument, when not None, overrides
        the corresponding attribute of this mention in the copy.

        NOTE: explicit `is None` checks are used (rather than `or`) so that
        falsy overrides such as `maybe_keep=False`, `maybe_sentence_index=0`,
        or an empty arguments dict are honored instead of being silently
        replaced by the current values.
        """
        return EventMention(
            trigger = self.trigger if maybe_trigger is None else maybe_trigger,
            labels = self.labels if maybe_labels is None else maybe_labels,
            token_interval = self.token_interval if maybe_token_interval is None else maybe_token_interval,
            sentence_index = self.sentence_index if maybe_sentence_index is None else maybe_sentence_index,
            document = self.document if maybe_document is None else maybe_document,
            keep = self.keep if maybe_keep is None else maybe_keep,
            arguments = self.arguments if maybe_arguments is None else maybe_arguments,
            paths = self.paths if maybe_paths is None else maybe_paths,
            found_by = self.found_by if maybe_found_by is None else maybe_found_by
        )

    # TODO: implement me
    # see https://github.com/clulab/processors/blob/9f89ea7bf6ac551f77dbfdbb8eec9bf216711df4/main/src/main/scala/org/clulab/odin/Mention.scala#L323-L330
    @property
    def is_valid(self) -> bool:
        """returns true if this is a valid mention"""
        raise NotImplementedError

    # TODO: implement me
    def to_relation_mention(self) -> RelationMention:
        """Demote this EventMention to a RelationMention (drop the trigger)."""
        raise NotImplementedError

    # TODO: implement me
    # FIX: was missing `self` as the first parameter.
    def scatter(self, arg_name: str, size: int) -> list[EventMention]:
        raise NotImplementedError
        # arguments
        #   .getOrElse(argName, Nil)
        #   .combinations(size)
        #   .map(args => this + (argName -> args))
        #   .toList

    # TODO: implement me
    # Create a new EventMention by removing a single argument
    # FIX: was missing `self` as the first parameter.
    def __sub__(self, other: typing.Any) -> EventMention:
        raise NotImplementedError
        #copy(arguments = this.arguments - argName)
        # Create a new EventMention by removing a sequence of arguments
        # def --(argNames: Seq[String]): EventMention =
        #   copy(arguments = this.arguments -- argNames)

    # TODO: implement me
    # FIX: was missing `self` as the first parameter.
    def __add__(self, other: typing.Any) -> EventMention:
        """Create a new EventMention by adding a key, value pair to the arguments map"""
        #def +(arg: (String, Seq[Mention])): EventMention =
        #copy(arguments = this.arguments + arg)
        raise NotImplementedError
class CrossSentenceMention(Mention):
    # A mention relating two mentions that live in different sentences
    # of the same document (an anchor and its neighbor).
    anchor: Mention = Field(description="The mention serving as the anchor for this cross-sentence mention")
    neighbor: Mention = Field(description="The second mention for this cross-sentence mention")

    # FIXME: add check on arguments
    # (the Scala reference enforces these invariants at construction time)
    #require(arguments.size == 2, "CrossSentenceMention must have exactly two arguments")
    # assert anchor.document == neighbor.document
    # assert anchor.sentence_obj != neighbor.sentence_obj