Coverage for python/lum/clu/processors/document.py: 100%

23 statements  

coverage.py v7.6.7, created at 2024-11-17 18:41 +0000

from __future__ import annotations

import typing

from pydantic import BaseModel, ConfigDict, Field

from lum.clu.processors.sentence import Sentence
from lum.clu.processors.utils import Labels


class Document(BaseModel):
    """
    Storage class for annotated text. Based on
    [`org.clulab.processors.Document`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/Document.scala).
    """

    model_config = ConfigDict(populate_by_name=True)

    id: typing.Optional[str] = Field(default=None, description="A unique ID for the `Document`.")

    text: typing.Optional[str] = Field(default=None, description="The text of the `Document`.")

    sentences: list[Sentence] = Field(description="The sentences comprising the `Document`.")
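    # Construction sketch (hypothetical values). `Sentence`'s fields and the
    # camelCase aliases (`startOffsets`, `endOffsets`) are assumed from their
    # use in `merge_documents` below; fields not shown are assumed optional.
    #
    #   doc = Document(
    #       id="doc-1",
    #       text="Hello world.",
    #       sentences=[
    #           Sentence(
    #               text="Hello world.",
    #               words=["Hello", "world", "."],
    #               startOffsets=[0, 6, 11],
    #               endOffsets=[5, 11, 12],
    #           )
    #       ],
    #   )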

    @staticmethod
    def merge_documents(docs: list[Document]) -> Document:
        """Merges two or more `Document`s into a single `Document`."""
        text = ""
        sentences = []
        # Running character offset: how far each doc's sentences must be
        # shifted to index correctly into the concatenated text.
        offset = 0
        for doc in docs:
            for old in doc.sentences:
                s = Sentence(
                    text=old.text,
                    raw=old.raw,
                    words=old.words,
                    startOffsets=[i + offset for i in old.start_offsets],
                    endOffsets=[i + offset for i in old.end_offsets],
                    tags=old.tags,
                    lemmas=old.lemmas,
                    norms=old.norms,
                    chunks=old.chunks,
                    entities=old.entities,
                    graphs=old.graphs,
                )
                sentences.append(s)
            if doc.text:
                text += doc.text
                offset += len(doc.text)

        # Assumes `docs` is non-empty; the merged Document keeps the first doc's id.
        return Document(
            id=docs[0].id,
            text=text if len(text) > 0 else None,
            sentences=sentences,
        )
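    # Usage sketch for `merge_documents` (hypothetical `d1`/`d2`). The merged
    # `Document` keeps the first doc's `id`, concatenates the `text` of docs
    # that have one, and shifts each sentence's character offsets by the
    # length of the text accumulated before its source doc.
    #
    #   merged = Document.merge_documents([d1, d2])
    #   # merged.id == d1.id
    #   # merged.text == d1.text + d2.text  (when both have text)
    #   # d2's start/end offsets are shifted by len(d1.text)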

    # size : int
    #     The number of `sentences`.

    # words : [str]
    #     A list of the `Document`'s tokens.

    # tags : [str]
    #     A list of the `Document`'s tokens represented using part of speech (PoS) tags.

    # lemmas : [str]
    #     A list of the `Document`'s tokens represented using lemmas.

    # _entities : [str]
    #     A list of the `Document`'s tokens represented using IOB-style named entity (NE) labels.

    # nes : dict
    #     A dictionary mapping each NE label in the `Document` to a list of its corresponding text spans.

    # bag_of_labeled_deps : [str]
    #     The labeled dependencies from all sentences in the `Document`.

    # bag_of_unlabeled_deps : [str]
    #     The unlabeled dependencies from all sentences in the `Document`.

    # text : str or None
    #     The original text of the `Document`.

    # Methods
    # -------
    # bag_of_labeled_dependencies_using(form)
    #     Produces a list of syntactic dependencies where each edge is labeled with its grammatical relation.

    # bag_of_unlabeled_dependencies_using(form)
    #     Produces a list of syntactic dependencies where each edge is left unlabeled (its grammatical relation is omitted).

    # self.nes = self._merge_ne_dicts()
    # self.bag_of_labeled_deps = list(chain(*[s.dependencies.labeled for s in self.sentences]))
    # self.bag_of_unlabeled_deps = list(chain(*[s.dependencies.unlabeled for s in self.sentences]))

    # def __hash__(self):
    #     return hash(self.to_JSON())

    # def __unicode__(self):
    #     return self.text

    # def __str__(self):
    #     return "Document w/ {} Sentence{}".format(self.size, "" if self.size == 1 else "s")

    # def __eq__(self, other):
    #     if isinstance(other, self.__class__):
    #         return self.to_JSON() == other.to_JSON()
    #     else:
    #         return False

    # def __ne__(self, other):
    #     return not self.__eq__(other)

    # def bag_of_labeled_dependencies_using(self, form):
    #     return list(chain(*[s.labeled_dependencies_from_tokens(s._get_tokens(form)) for s in self.sentences]))

    # def bag_of_unlabeled_dependencies_using(self, form):
    #     return list(chain(*[s.unlabeled_dependencies_from_tokens(s._get_tokens(form)) for s in self.sentences]))

    # def _merge_ne_dicts(self):
    #     # Get the set of all NE labels found in the Doc's sentences
    #     entity_labels = set(chain(*[s.nes.keys() for s in self.sentences]))
    #     # Do we have any labels?
    #     if not entity_labels:
    #         return None
    #     # If we have labels, consolidate the NEs under the appropriate label
    #     else:
    #         nes_dict = dict()
    #         for e in entity_labels:
    #             entities = []
    #             for s in self.sentences:
    #                 entities += s.nes[e]
    #             nes_dict[e] = entities
    #         return nes_dict