Module lum.clu.processors.document

Expand source code
from __future__ import annotations
from pydantic import BaseModel, Field, ConfigDict
from lum.clu.processors.sentence import Sentence
from lum.clu.processors.utils import Labels
import typing


class Document(BaseModel):
    """
    Storage class for annotated text. Based on [`org.clulab.processors.Document`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/Document.scala)
    """

    model_config = ConfigDict(populate_by_name=True)
    
    id: typing.Optional[str] = Field(default=None, description="A unique ID for the `Document`.")

    text: typing.Optional[str] = Field(default=None, description="The text of the `Document`.")

    sentences: list[Sentence] = Field(description="The sentences comprising the `Document`.")

    @staticmethod
    def merge_documents(docs: list[Document]) -> Document:
        """Merges two or more Documents into a single Document.

        Sentence character offsets are shifted so that they index into the
        concatenated text, and the merged Document keeps the id of the first
        input.
        """
        text = ""
        sentences = []
        offset = 0
        for doc in docs:
            for old in doc.sentences:
                # Copy each Sentence, shifting its character offsets by the
                # total length of the text that precedes this Document.
                s = Sentence(
                    text=old.text,
                    raw=old.raw,
                    words=old.words,
                    startOffsets=[i + offset for i in old.start_offsets],
                    endOffsets=[i + offset for i in old.end_offsets],
                    tags=old.tags,
                    lemmas=old.lemmas,
                    norms=old.norms,
                    chunks=old.chunks,
                    entities=old.entities,
                    graphs=old.graphs
                )
                sentences.append(s)
            # NOTE: the offset only advances when a Document carries text;
            # sentences from text-less Documents keep their original offsets.
            if doc.text:
                text += doc.text
                offset += len(doc.text)

        return Document(
            id=docs[0].id,
            text=text if len(text) > 0 else None,
            sentences=sentences
        )

    # size : int
    #     The number of `sentences`.

    # words : [str]
    #     A list of the `Document`'s tokens.

    # tags : [str]
    #     A list of the `Document`'s tokens represented using part of speech (PoS) tags.

    # lemmas : [str]
    #     A list of the `Document`'s tokens represented using lemmas.

    # _entities : [str]
    #     A list of the `Document`'s tokens represented using IOB-style named entity (NE) labels.

    # nes : dict
    #     A dictionary of NE labels represented in the `Document` -> a list of corresponding text spans.

    # bag_of_labeled_deps : [str]
    #     The labeled dependencies from all sentences in the `Document`.

    # bag_of_unlabeled_deps : [str]
    #     The unlabeled dependencies from all sentences in the `Document`.

    # text : str or None
    #     The original text of the `Document`.

    # Methods
    # -------
    # bag_of_labeled_dependencies_using(form)
    #     Produces a list of syntactic dependencies where each edge is labeled with its grammatical relation.

    # bag_of_unlabeled_dependencies_using(form)
    #     Produces a list of syntactic dependencies where each edge is left unlabeled, i.e. its grammatical relation is omitted.


    # self.nes = merge_entity_dicts = self._merge_ne_dicts()
    # self.bag_of_labeled_deps = list(chain(*[s.dependencies.labeled for s in self.sentences]))
    # self.bag_of_unlabeled_deps = list(chain(*[s.dependencies.unlabeled for s in self.sentences]))

    # def __hash__(self):
    #     return hash(self.to_JSON())

    # def __unicode__(self):
    #     return self.text

    # def __str__(self):
    #     return "Document w/ {} Sentence{}".format(self.size, "" if self.size == 1 else "s")

    # def __eq__(self, other):
    #     if isinstance(other, self.__class__):
    #         return self.to_JSON() == other.to_JSON()
    #     else:
    #         return False

    # def __ne__(self, other):
    #     return not self.__eq__(other)

    # def bag_of_labeled_dependencies_using(self, form):
    #     return list(chain(*[s.labeled_dependencies_from_tokens(s._get_tokens(form)) for s in self.sentences]))

    # def bag_of_unlabeled_dependencies_using(self, form):
    #     return list(chain(*[s.unlabeled_dependencies_from_tokens(s._get_tokens(form)) for s in self.sentences]))

    # def _merge_ne_dicts(self):
    #     # Get the set of all NE labels found in the Doc's sentences
    #     entity_labels = set(chain(*[s.nes.keys() for s in self.sentences]))
    #     # Do we have any labels? (the set is empty, never None, when there are none)
    #     if not entity_labels:
    #         return None
    #     # If we have labels, consolidate the NEs under the appropriate label
    #     else:
    #         nes_dict = dict()
    #         for e in entity_labels:
    #             entities = []
    #             for s in self.sentences:
    #                 entities += s.nes[e]
    #             nes_dict[e] = entities
    #         return nes_dict
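
The populate_by_name=True config above means the model can be populated by attribute name even where a field alias is defined. A minimal round-trip sketch using standard pydantic v2 methods, assuming doc is any valid Document instance (for example, one built as shown under Classes below):

# by_alias=True serializes with field aliases where they are defined
# (e.g. Sentence's startOffsets); this is a standard pydantic v2 option.
payload = doc.model_dump_json(by_alias=True)
# populate_by_name=True lets validation accept attribute names as well as aliases.
restored = Document.model_validate_json(payload)
assert restored == doc  # pydantic models compare field by field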

Classes

class Document (**data: Any)

Storage class for annotated text. Based on org.clulab.processors.Document

Create a new model by parsing and validating input data from keyword arguments.

Raises pydantic_core.ValidationError if the input data cannot be validated to form a valid model.

self is explicitly positional-only to allow self as a field name.
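
For illustration, a minimal construction sketch. The Sentence field names (words, raw, startOffsets, endOffsets) are taken from merge_documents below; exactly which of them Sentence requires is an assumption here:

from lum.clu.processors.document import Document
from lum.clu.processors.sentence import Sentence

# Hypothetical example values; Sentence may require more (or fewer) fields.
sent = Sentence(
    words=["Gonzo", "married", "Camilla", "."],
    raw=["Gonzo", "married", "Camilla", "."],
    startOffsets=[0, 6, 14, 21],
    endOffsets=[5, 13, 21, 22],
)

doc = Document(
    id="example-1",                 # optional
    text="Gonzo married Camilla.",  # optional
    sentences=[sent],               # required
)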

Expand source code
class Document(BaseModel):
    """
    Storage class for annotated text. Based on [`org.clulab.processors.Document`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/Document.scala)
    """

    model_config = ConfigDict(populate_by_name=True)
    
    id: typing.Optional[str] = Field(default=None, description="A unique ID for the `Document`.")

    text: typing.Optional[str] = Field(default=None, description="The text of the `Document`.")

    sentences: list[Sentence] = Field(description="The sentences comprising the `Document`.")

    @staticmethod
    def merge_documents(docs: list[Document]) -> Document:
        """Merges two or more Documents into a single Document.

        Sentence character offsets are shifted so that they index into the
        concatenated text, and the merged Document keeps the id of the first
        input.
        """
        text = ""
        sentences = []
        offset = 0
        for doc in docs:
            for old in doc.sentences:
                # Copy each Sentence, shifting its character offsets by the
                # total length of the text that precedes this Document.
                s = Sentence(
                    text=old.text,
                    raw=old.raw,
                    words=old.words,
                    startOffsets=[i + offset for i in old.start_offsets],
                    endOffsets=[i + offset for i in old.end_offsets],
                    tags=old.tags,
                    lemmas=old.lemmas,
                    norms=old.norms,
                    chunks=old.chunks,
                    entities=old.entities,
                    graphs=old.graphs
                )
                sentences.append(s)
            # NOTE: the offset only advances when a Document carries text;
            # sentences from text-less Documents keep their original offsets.
            if doc.text:
                text += doc.text
                offset += len(doc.text)

        return Document(
            id=docs[0].id,
            text=text if len(text) > 0 else None,
            sentences=sentences
        )

    # size : int
    #     The number of `sentences`.

    # words : [str]
    #     A list of the `Document`'s tokens.

    # tags : [str]
    #     A list of the `Document`'s tokens represented using part of speech (PoS) tags.

    # lemmas : [str]
    #     A list of the `Document`'s tokens represented using lemmas.

    # _entities : [str]
    #     A list of the `Document`'s tokens represented using IOB-style named entity (NE) labels.

    # nes : dict
    #     A dictionary of NE labels represented in the `Document` -> a list of corresponding text spans.

    # bag_of_labeled_deps : [str]
    #     The labeled dependencies from all sentences in the `Document`.

    # bag_of_unlabeled_deps : [str]
    #     The unlabeled dependencies from all sentences in the `Document`.

    # text : str or None
    #     The original text of the `Document`.

    # Methods
    # -------
    # bag_of_labeled_dependencies_using(form)
    #     Produces a list of syntactic dependencies where each edge is labeled with its grammatical relation.

    # bag_of_unlabeled_dependencies_using(form)
    #     Produces a list of syntactic dependencies where each edge is left unlabeled, i.e. its grammatical relation is omitted.


    # self.nes = merge_entity_dicts = self._merge_ne_dicts()
    # self.bag_of_labeled_deps = list(chain(*[s.dependencies.labeled for s in self.sentences]))
    # self.bag_of_unlabeled_deps = list(chain(*[s.dependencies.unlabeled for s in self.sentences]))

    # def __hash__(self):
    #     return hash(self.to_JSON())

    # def __unicode__(self):
    #     return self.text

    # def __str__(self):
    #     return "Document w/ {} Sentence{}".format(self.size, "" if self.size == 1 else "s")

    # def __eq__(self, other):
    #     if isinstance(other, self.__class__):
    #         return self.to_JSON() == other.to_JSON()
    #     else:
    #         return False

    # def __ne__(self, other):
    #     return not self.__eq__(other)

    # def bag_of_labeled_dependencies_using(self, form):
    #     return list(chain(*[s.labeled_dependencies_from_tokens(s._get_tokens(form)) for s in self.sentences]))

    # def bag_of_unlabeled_dependencies_using(self, form):
    #     return list(chain(*[s.unlabeled_dependencies_from_tokens(s._get_tokens(form)) for s in self.sentences]))

    # def _merge_ne_dicts(self):
    #     # Get the set of all NE labels found in the Doc's sentences
    #     entity_labels = set(chain(*[s.nes.keys() for s in self.sentences]))
    #     # Do we have any labels? (the set is empty, never None, when there are none)
    #     if not entity_labels:
    #         return None
    #     # If we have labels, consolidate the NEs under the appropriate label
    #     else:
    #         nes_dict = dict()
    #         for e in entity_labels:
    #             entities = []
    #             for s in self.sentences:
    #                 entities += s.nes[e]
    #             nes_dict[e] = entities
    #         return nes_dict

Ancestors

  • pydantic.main.BaseModel

Class variables

var id : Optional[str]
var model_computed_fields
var model_config
var model_fields
var sentences : list[Sentence]
var text : Optional[str]

Static methods

def merge_documents(docs: list[Document]) ‑> Document

Merges two or more Documents into a single Document.

Sentence character offsets are shifted so that they index into the concatenated text, and the merged Document keeps the id of the first input.
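
A minimal usage sketch, assuming doc_a and doc_b are Document instances that both carry their original text:

merged = Document.merge_documents([doc_a, doc_b])

assert merged.id == doc_a.id                   # id comes from the first input
assert merged.text == doc_a.text + doc_b.text  # texts are concatenated in order

# The first sentence that came from doc_b now indexes into the merged text.
shifted = merged.sentences[len(doc_a.sentences)]
assert shifted.start_offsets[0] == doc_b.sentences[0].start_offsets[0] + len(doc_a.text)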

Expand source code
@staticmethod
def merge_documents(docs: list[Document]) -> Document:
    """Merges two or more Documents into a single Document.

    Sentence character offsets are shifted so that they index into the
    concatenated text, and the merged Document keeps the id of the first
    input.
    """
    text = ""
    sentences = []
    offset = 0
    for doc in docs:
        for old in doc.sentences:
            # Copy each Sentence, shifting its character offsets by the
            # total length of the text that precedes this Document.
            s = Sentence(
                text=old.text,
                raw=old.raw,
                words=old.words,
                startOffsets=[i + offset for i in old.start_offsets],
                endOffsets=[i + offset for i in old.end_offsets],
                tags=old.tags,
                lemmas=old.lemmas,
                norms=old.norms,
                chunks=old.chunks,
                entities=old.entities,
                graphs=old.graphs
            )
            sentences.append(s)
        # NOTE: the offset only advances when a Document carries text;
        # sentences from text-less Documents keep their original offsets.
        if doc.text:
            text += doc.text
            offset += len(doc.text)

    return Document(
        id=docs[0].id,
        text=text if len(text) > 0 else None,
        sentences=sentences
    )