Coverage for python/lum/clu/processors/document.py: 100%

23 statements  

coverage.py v7.6.7, created at 2024-11-17 18:41 +0000

from __future__ import annotations

import typing

from pydantic import BaseModel, ConfigDict, Field

from lum.clu.processors.sentence import Sentence
from lum.clu.processors.utils import Labels


class Document(BaseModel):
    """
    Storage class for annotated text. Based on
    [`org.clulab.processors.Document`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/Document.scala).
    """

    model_config = ConfigDict(populate_by_name=True)

    id: typing.Optional[str] = Field(default=None, description="A unique ID for the `Document`.")

    text: typing.Optional[str] = Field(default=None, description="The text of the `Document`.")

    sentences: list[Sentence] = Field(description="The sentences comprising the `Document`.")
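    # Construction sketch (hypothetical values). `Sentence`'s fields and the
    # camelCase aliases (`startOffsets`, `endOffsets`) are assumed from their
    # use in `merge_documents` below; fields not shown are assumed optional.
    #
    #   doc = Document(
    #       id="doc-1",
    #       text="Hello world.",
    #       sentences=[
    #           Sentence(
    #               text="Hello world.",
    #               words=["Hello", "world", "."],
    #               startOffsets=[0, 6, 11],
    #               endOffsets=[5, 11, 12],
    #           )
    #       ],
    #   )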

    @staticmethod
    def merge_documents(docs: list[Document]) -> Document:
        """Merges two or more `Document`s into a single `Document`."""
        text = ""
        sentences = []
        # Running character offset: how far each doc's sentences must be
        # shifted to index correctly into the concatenated text.
        offset = 0
        for doc in docs:
            for old in doc.sentences:
                s = Sentence(
                    text=old.text,
                    raw=old.raw,
                    words=old.words,
                    startOffsets=[i + offset for i in old.start_offsets],
                    endOffsets=[i + offset for i in old.end_offsets],
                    tags=old.tags,
                    lemmas=old.lemmas,
                    norms=old.norms,
                    chunks=old.chunks,
                    entities=old.entities,
                    graphs=old.graphs,
                )
                sentences.append(s)
            if doc.text:
                text += doc.text
                offset += len(doc.text)

        # Assumes `docs` is non-empty; the merged Document keeps the first doc's id.
        return Document(
            id=docs[0].id,
            text=text if len(text) > 0 else None,
            sentences=sentences,
        )
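    # Usage sketch for `merge_documents` (hypothetical `d1`/`d2`). The merged
    # `Document` keeps the first doc's `id`, concatenates the `text` of docs
    # that have one, and shifts each sentence's character offsets by the
    # length of the text accumulated before its source doc.
    #
    #   merged = Document.merge_documents([d1, d2])
    #   # merged.id == d1.id
    #   # merged.text == d1.text + d2.text  (when both have text)
    #   # d2's start/end offsets are shifted by len(d1.text)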

    # size : int
    #     The number of `sentences`.

    # words : [str]
    #     A list of the `Document`'s tokens.

    # tags : [str]
    #     A list of the `Document`'s tokens represented using part of speech (PoS) tags.

    # lemmas : [str]
    #     A list of the `Document`'s tokens represented using lemmas.

    # _entities : [str]
    #     A list of the `Document`'s tokens represented using IOB-style named entity (NE) labels.

    # nes : dict
    #     A dictionary mapping each NE label in the `Document` to a list of its corresponding text spans.

    # bag_of_labeled_deps : [str]
    #     The labeled dependencies from all sentences in the `Document`.

    # bag_of_unlabeled_deps : [str]
    #     The unlabeled dependencies from all sentences in the `Document`.

    # text : str or None
    #     The original text of the `Document`.

    # Methods
    # -------
    # bag_of_labeled_dependencies_using(form)
    #     Produces a list of syntactic dependencies where each edge is labeled with its grammatical relation.

    # bag_of_unlabeled_dependencies_using(form)
    #     Produces a list of syntactic dependencies where each edge is left unlabeled (its grammatical relation is omitted).

    # self.nes = self._merge_ne_dicts()
    # self.bag_of_labeled_deps = list(chain(*[s.dependencies.labeled for s in self.sentences]))
    # self.bag_of_unlabeled_deps = list(chain(*[s.dependencies.unlabeled for s in self.sentences]))

    # def __hash__(self):
    #     return hash(self.to_JSON())

    # def __unicode__(self):
    #     return self.text

    # def __str__(self):
    #     return "Document w/ {} Sentence{}".format(self.size, "" if self.size == 1 else "s")

    # def __eq__(self, other):
    #     if isinstance(other, self.__class__):
    #         return self.to_JSON() == other.to_JSON()
    #     else:
    #         return False

    # def __ne__(self, other):
    #     return not self.__eq__(other)

    # def bag_of_labeled_dependencies_using(self, form):
    #     return list(chain(*[s.labeled_dependencies_from_tokens(s._get_tokens(form)) for s in self.sentences]))

    # def bag_of_unlabeled_dependencies_using(self, form):
    #     return list(chain(*[s.unlabeled_dependencies_from_tokens(s._get_tokens(form)) for s in self.sentences]))

    # def _merge_ne_dicts(self):
    #     # Get the set of all NE labels found in the Doc's sentences
    #     entity_labels = set(chain(*[s.nes.keys() for s in self.sentences]))
    #     # Do we have any labels?
    #     if not entity_labels:
    #         return None
    #     # If we have labels, consolidate the NEs under the appropriate label
    #     else:
    #         nes_dict = dict()
    #         for e in entity_labels:
    #             entities = []
    #             for s in self.sentences:
    #                 entities += s.nes[e]
    #             nes_dict[e] = entities
    #         return nes_dict