Module lum.clu.processors.sentence
Expand source code
from __future__ import annotations
from pydantic import BaseModel, ConfigDict, Field, model_validator
from lum.clu.processors.directed_graph import DirectedGraph
from lum.clu.processors.utils import Labels
import typing
__all__ = ["Sentence"]
class Sentence(BaseModel):
    """
    Storage class for an annotated sentence. Based on [`org.clulab.processors.Sentence`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/Sentence.scala)
    """
    UNKNOWN: typing.ClassVar[str] = Labels.UNKNOWN
    # the O (outside) label in IOB notation
    O: typing.ClassVar[str] = Labels.O
model_config = ConfigDict(populate_by_name=True)
    text: typing.Optional[str] = Field(default=None, description="The text of the `Sentence`.", exclude=True)
raw: list[str] = Field(description="Raw tokens in this sentence; these are expected to match the original text")
words: list[str] = Field(description="A list of the `Sentence`'s tokens.")
start_offsets: list[int] = Field(alias="startOffsets", description="The character offsets starting each token (inclusive).")
end_offsets: list[int] = Field(alias="endOffsets", description="The character offsets marking the end of each token (exclusive).")
tags: typing.Optional[list[str]] = Field(default=None, description="A list of the `Sentence`'s tokens represented using part of speech (PoS) tags.")
lemmas: typing.Optional[list[str]] = Field(default=None, description="A list of the `Sentence`'s tokens represented using lemmas.")
norms: typing.Optional[list[str]] = Field(default=None, description="Normalized values of named/numeric entities, such as dates.")
chunks: typing.Optional[list[str]] = Field(default=None, description="A list of the `Sentence`'s tokens represented using IOB-style phrase labels (ex. `B-NP`, `I-NP`, `B-VP`, etc.).")
entities: typing.Optional[list[str]] = Field(default=None, description="A list of the `Sentence`'s tokens represented using IOB-style named entity (NE) labels.")
    graphs: dict[str, DirectedGraph] = Field(description="A dictionary mapping each graph type/name to a `lum.clu.processors.directed_graph.DirectedGraph`.")
@model_validator(mode="before")
@classmethod
def raw_or_words(cls, data: typing.Any) -> typing.Any:
"""if `raw` is not present, use `words` in its place."""
if isinstance(data, dict):
words = data.get("words", None)
raw = data.get("raw", None)
if raw is None:
data["raw"] = words
return data
# length : int
# The number of tokens in the `Sentence`
    # basic_dependencies : lum.clu.processors.directed_graph.DirectedGraph
    # A `DirectedGraph` using basic Stanford dependencies.
    # collapsed_dependencies : lum.clu.processors.directed_graph.DirectedGraph
    # A `DirectedGraph` using collapsed Stanford dependencies.
    # dependencies : lum.clu.processors.directed_graph.DirectedGraph
    # A pointer to the preferred syntactic dependency graph type for this `Sentence`.
# _entities : [str]
# The IOB-style Named Entity (NE) labels corresponding to each token.
# _chunks : [str]
# The IOB-style chunk labels corresponding to each token.
    # nes : dict
    # A dictionary of NE labels represented in the `Sentence` -> a list of corresponding text spans (ex. {"PERSON": [phrase 1, ..., phrase n]}). Built from `Sentence._entities`
    # phrases : dict
    # A dictionary of chunk labels represented in the `Sentence` -> a list of corresponding text spans (ex. {"NP": [phrase 1, ..., phrase n]}). Built from `Sentence._chunks`
# Methods
# -------
# bag_of_labeled_dependencies_using(form)
# Produces a list of syntactic dependencies where each edge is labeled with its grammatical relation.
# bag_of_unlabeled_dependencies_using(form)
    # Produces a list of syntactic dependencies where each edge is left unlabeled (its grammatical relation is omitted).
@property
def length(self) -> int:
return len(self.raw)
# self.basic_dependencies = self.graphs.get(DirectedGraph.STANFORD_BASIC_DEPENDENCIES, None)
# self.collapsed_dependencies = self.graphs.get(DirectedGraph.STANFORD_COLLAPSED_DEPENDENCIES, None)
    # self.dependencies = self.collapsed_dependencies if self.collapsed_dependencies is not None else self.basic_dependencies
# # IOB tokens -> {label: [phrase 1, ..., phrase n]}
# self.nes = self._handle_iob(self._entities)
# self.phrases = self._handle_iob(self._chunks)
# def __eq__(self, other):
# if isinstance(other, self.__class__):
# return self.to_JSON() == other.to_JSON()
# else:
# return False
# def __ne__(self, other):
# return not self.__eq__(other)
# def __hash__(self):
# return hash(self.to_JSON(pretty=False))
# def deduplication_hash(self):
# """
# Generates a deduplication hash for the sentence
# """
# return hashlib.sha256(self.to_JSON(pretty=False).encode()).hexdigest()
# def _get_tokens(self, form):
# f = form.lower()
# if f == "words":
# tokens = self.words
# elif f == "tags":
# tokens = self.tags
# elif f == "lemmas":
# tokens = self.lemmas
# elif f == "entities":
    # tokens = self._entities
# elif f == "index":
# tokens = list(range(self.length))
# # unrecognized form
# else:
    # raise Exception("""form must be 'words', 'tags', 'lemmas', 'entities', or 'index'""")
# return tokens
# def _set_toks(self, toks):
# return toks if toks else [Sentence.UNKNOWN]*self.length
# def _handle_iob(self, iob):
# """
# Consolidates consecutive tokens in IOB notation under the appropriate label.
    # The regexes strip the B-/I- prefixes produced by the bionlp annotator, which uses IOB notation.
# """
# entity_dict = defaultdict(list)
# # initialize to empty label
# current = Sentence.O
# start = None
# end = None
# for i, tok in enumerate(iob):
    # # the token is O (outside any entity)
# if tok == Sentence.O:
# # did we have an entity with the last token?
# current = re.sub('(B-|I-)','', str(current))
# if current == Sentence.O:
# continue
# else:
# # the last sequence has ended
# end = i
# # store the entity
# named_entity = ' '.join(self.words[start:end])
# entity_dict[current].append(named_entity)
# # reset our book-keeping vars
# current = Sentence.O
# start = None
# end = None
# # we have a tag!
# else:
# # our old sequence continues
# current = re.sub('(B-|I-)','', str(current))
# tok = re.sub('(B-|I-)','', str(tok))
# if tok == current:
# end = i
# # our old sequence has ended
# else:
# # do we have a previous NE?
# if current != Sentence.O:
# end = i
# named_entity = ' '.join(self.words[start:end])
# entity_dict[current].append(named_entity)
# # update our book-keeping vars
# current = tok
# start = i
# end = None
# # this might be empty
# return entity_dict
# def bag_of_labeled_dependencies_using(self, form):
# """
# Produces a list of syntactic dependencies
# where each edge is labeled with its grammatical relation.
# """
# tokens = self._get_tokens(form)
# return self.labeled_dependencies_from_tokens(tokens) if tokens else None
# def bag_of_unlabeled_dependencies_using(self, form):
# """
# Produces a list of syntactic dependencies
    # where each edge is left unlabeled (its grammatical relation is omitted).
# """
# tokens = self._get_tokens(form)
# return self.unlabeled_dependencies_from_tokens(tokens) if tokens else None
# def labeled_dependencies_from_tokens(self, tokens):
# """
# Generates a list of labeled dependencies for a sentence
# using the provided tokens
# """
# deps = self.dependencies
# return [(tokens[out], rel, tokens[dest]) \
# for out in deps.outgoing \
# for (dest, rel) in deps.outgoing[out]]
# def unlabeled_dependencies_from_tokens(self, tokens):
# """
# Generate a list of unlabeled dependencies for a sentence
# using the provided tokens
# """
# return [(head, dep) for (head, rel, dep) in self.labeled_dependencies_from_tokens(tokens)]
# def semantic_head(self, graph_name="stanford-collapsed", valid_tags={r"^N", "VBG"}, valid_indices=None):
# return HeadFinder.semantic_head(self, graph_name, valid_tags, valid_indices)
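The commented-out `_handle_iob` logic above consolidates runs of IOB-labeled tokens into per-label phrase lists. The following is a self-contained sketch of that consolidation step (an illustration of the idea only; `consolidate_iob` is a hypothetical helper, not part of this module's API):

import re
from collections import defaultdict

def consolidate_iob(words: list[str], iob: list[str], outside: str = "O") -> dict[str, list[str]]:
    """Group consecutive B-/I- tokens under their label, e.g.
    ["B-PER", "I-PER", "O"] over ["Noam", "Chomsky", "spoke"] -> {"PER": ["Noam Chomsky"]}.
    """
    spans: dict[str, list[str]] = defaultdict(list)
    current, start = outside, None
    for i, tok in enumerate(iob):
        label = re.sub(r"^(B-|I-)", "", tok)
        # a span closes on O, on a fresh B- of the same label, or on a label change
        if tok == outside or (tok.startswith("B-") and label == current) or label != current:
            if current != outside:
                spans[current].append(" ".join(words[start:i]))
            current, start = (outside, None) if tok == outside else (label, i)
    if current != outside:  # flush a span that runs to the end of the sentence
        spans[current].append(" ".join(words[start:]))
    return dict(spans)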
Classes
class Sentence (**data: Any)
-
Storage class for an annotated sentence. Based on [`org.clulab.processors.Sentence`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/Sentence.scala).
Create a new model by parsing and validating input data from keyword arguments. Raises `pydantic_core.ValidationError` if the input data cannot be validated to form a valid model. `self` is explicitly positional-only to allow `self` as a field name.
Ancestors
- pydantic.main.BaseModel
Class variables
var O : ClassVar[str]
-
The `O` (outside) label in IOB notation.
var UNKNOWN : ClassVar[str]
var chunks : Optional[list[str]]
var end_offsets : list[int]
var entities : Optional[list[str]]
var graphs : dict[str, DirectedGraph]
var lemmas : Optional[list[str]]
var model_computed_fields
var model_config
var model_fields
var norms : Optional[list[str]]
var raw : list[str]
var start_offsets : list[int]
var tags : Optional[list[str]]
var text : Optional[str]
var words : list[str]
Static methods
def raw_or_words(data: typing.Any) ‑> Any
-
If `raw` is not present, use `words` in its place.
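A quick check of that fallback (illustrative values): when the input mapping has no `raw` key, this pre-validator copies `words` into `raw` before field validation runs:

from lum.clu.processors.sentence import Sentence

data = {
    "words": ["Hello", "world"],
    "startOffsets": [0, 6],
    "endOffsets": [5, 11],
    "graphs": {},
}
s = Sentence.model_validate(data)  # no "raw" supplied
assert s.raw == s.words            # filled in by raw_or_words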
Instance variables
var length : int
-
The number of tokens in the `Sentence`, i.e. `len(self.raw)`.
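Two serialization details are worth noting here, shown in a small sketch (illustrative values): `text` is declared with `exclude=True`, so it is dropped from dumps, and `by_alias=True` restores the camelCase keys:

from lum.clu.processors.sentence import Sentence

s = Sentence.model_validate({
    "text": "Hi there",
    "words": ["Hi", "there"],
    "startOffsets": [0, 3],
    "endOffsets": [2, 8],
    "graphs": {},
})
print(s.length)                  # -> 2, i.e. len(s.raw)
dumped = s.model_dump(by_alias=True)
assert "text" not in dumped      # excluded from serialization
assert "startOffsets" in dumped  # alias-keyed output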