Module lum.clu.processors.document
Expand source code
from __future__ import annotations

from pydantic import BaseModel, Field, ConfigDict
from lum.clu.processors.sentence import Sentence
from lum.clu.processors.utils import Labels
import typing


class Document(BaseModel):
    """
    Storage class for annotated text. Based on [`org.clulab.processors.Document`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/Document.scala)
    """

    model_config = ConfigDict(populate_by_name=True)

    id: typing.Optional[str] = Field(default=None, description="A unique ID for the `Document`.")

    text: typing.Optional[str] = Field(default=None, description="The text of the `Document`.")

    sentences: list[Sentence] = Field(description="The sentences comprising the `Document`.")

    @staticmethod
    def merge_documents(docs: list[Document]) -> Document:
        """Merges two or more Documents into a single Document."""
        text = ""
        sentences = []
        # Running character offset: sentences copied from each Document are
        # shifted by the total length of the text that precedes them.
        offset = 0
        for doc in docs:
            for old in doc.sentences:
                # Copy every annotation layer, adjusting only the offsets.
                s = Sentence(
                    text=old.text,
                    raw=old.raw,
                    words=old.words,
                    startOffsets=[i + offset for i in old.start_offsets],
                    endOffsets=[i + offset for i in old.end_offsets],
                    tags=old.tags,
                    lemmas=old.lemmas,
                    norms=old.norms,
                    chunks=old.chunks,
                    entities=old.entities,
                    graphs=old.graphs,
                )
                sentences.append(s)
            if doc.text:
                text += doc.text
                offset += len(doc.text)
        # The merged Document keeps only the first Document's id.
        return Document(
            id=docs[0].id,
            text=text if len(text) > 0 else None,
            sentences=sentences,
        )

    # size : int
    #     The number of `sentences`.
    # words : [str]
    #     A list of the `Document`'s tokens.
    # tags : [str]
    #     A list of the `Document`'s tokens represented using part of speech (PoS) tags.
    # lemmas : [str]
    #     A list of the `Document`'s tokens represented using lemmas.
    # _entities : [str]
    #     A list of the `Document`'s tokens represented using IOB-style named entity (NE) labels.
    # nes : dict
    #     A dictionary of NE labels represented in the `Document` -> a list of corresponding text spans.
    # bag_of_labeled_deps : [str]
    #     The labeled dependencies from all sentences in the `Document`.
    # bag_of_unlabeled_deps : [str]
    #     The unlabeled dependencies from all sentences in the `Document`.
    # text : str or None
    #     The original text of the `Document`.

    # Methods
    # -------
    # bag_of_labeled_dependencies_using(form)
    #     Produces a list of syntactic dependencies where each edge is labeled with its grammatical relation.
    # bag_of_unlabeled_dependencies_using(form)
    #     Produces a list of syntactic dependencies where each edge is left unlabeled without its grammatical relation.

    # self.nes = merge_entity_dicts = self._merge_ne_dicts()
    # self.bag_of_labeled_deps = list(chain(*[s.dependencies.labeled for s in self.sentences]))
    # self.bag_of_unlabeled_deps = list(chain(*[s.dependencies.unlabeled for s in self.sentences]))

    # def __hash__(self):
    #     return hash(self.to_JSON())

    # def __unicode__(self):
    #     return self.text

    # def __str__(self):
    #     return "Document w/ {} Sentence{}".format(self.size, "" if self.size == 1 else "s")

    # def __eq__(self, other):
    #     if isinstance(other, self.__class__):
    #         return self.to_JSON() == other.to_JSON()
    #     else:
    #         return False

    # def __ne__(self, other):
    #     return not self.__eq__(other)

    # def bag_of_labeled_dependencies_using(self, form):
    #     return list(chain(*[s.labeled_dependencies_from_tokens(s._get_tokens(form)) for s in self.sentences]))

    # def bag_of_unlabeled_dependencies_using(self, form):
    #     return list(chain(*[s.unlabeled_dependencies_from_tokens(s._get_tokens(form)) for s in self.sentences]))

    # def _merge_ne_dicts(self):
    #     # Get the set of all NE labels found in the Doc's sentences
    #     entity_labels = set(chain(*[s.nes.keys() for s in self.sentences]))
    #     # Do we have any labels?
    #     if entity_labels == None:
    #         return None
    #     # If we have labels, consolidate the NEs under the appropriate label
    #     else:
    #         nes_dict = dict()
    #         for e in entity_labels:
    #             entities = []
    #             for s in self.sentences:
    #                 entities += s.nes[e]
    #             nes_dict[e] = entities
    #         return nes_dict
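For orientation, a minimal usage sketch follows. It assumes a Sentence can be
constructed from just words, startOffsets, and endOffsets (the other annotation
fields passed in merge_documents above are treated as optional here); check
lum.clu.processors.sentence for the actual requirements.

# Usage sketch; assumes words/startOffsets/endOffsets suffice for Sentence.
from lum.clu.processors.document import Document
from lum.clu.processors.sentence import Sentence

s = Sentence(
    words=["Bears", "eat", "fish", "."],
    startOffsets=[0, 6, 10, 14],
    endOffsets=[5, 9, 14, 15],
)
doc = Document(id="doc-1", text="Bears eat fish.", sentences=[s])

# Because model_config sets populate_by_name=True, validation accepts field
# names as well as aliases, so a JSON round trip is straightforward:
payload = doc.model_dump_json()
restored = Document.model_validate_json(payload)
assert restored.sentences[0].words == ["Bears", "eat", "fish", "."]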
Classes
class Document (**data: Any)
Storage class for annotated text. Based on
org.clulab.processors.Document.

Create a new model by parsing and validating input data from keyword
arguments. Raises [ValidationError][pydantic_core.ValidationError] if the
input data cannot be validated to form a valid model. `self` is explicitly
positional-only to allow `self` as a field name.
Ancestors
- pydantic.main.BaseModel
Class variables
var id : Optional[str]
var model_computed_fields
var model_config
var model_fields
var sentences : list[Sentence]
var text : Optional[str]
Static methods
def merge_documents(docs: list[Document]) -> Document

Merges two or more Documents into a single Document.
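As a sketch of the offset bookkeeping: sentences copied from the second and
later documents have their character offsets shifted by the combined length of
the preceding documents' text. Reusing the hypothetical Sentence construction
from the sketch above:

# merge_documents shifts offsets by the length of all preceding text.
a = Document(
    text="Bears eat fish.",
    sentences=[Sentence(words=["Bears", "eat", "fish", "."],
                        startOffsets=[0, 6, 10, 14],
                        endOffsets=[5, 9, 14, 15])],
)
b = Document(
    text="Fish swim.",
    sentences=[Sentence(words=["Fish", "swim", "."],
                        startOffsets=[0, 5, 9],
                        endOffsets=[4, 9, 10])],
)
merged = Document.merge_documents([a, b])
# b's sentence is shifted by len(a.text) == 15:
assert merged.sentences[1].start_offsets == [15, 20, 24]
assert merged.text == "Bears eat fish.Fish swim."

Note that the texts are concatenated without a separator, so callers that want
whitespace between merged documents should add it before merging; the merged
Document also keeps only the first document's id.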