Module lum.odinson.rest.api
Source code
from __future__ import annotations
from typing import Any, Dict, Iterator, List, Literal, Optional, Text, Union
from lum.odinson.doc import AnyField, Document, Sentence
from lum.odinson.rest.responses import (
CorpusInfo,
OdinsonErrors,
ScoreDoc,
Statistic,
GrammarResults,
Results,
)
from lum.odinson.rest.requests import GrammarRequest, SimplePatternsRequest
from pydantic import BaseModel
from dataclasses import dataclass
import pydantic
import json
import requests
import urllib.parse
__all__ = ["OdinsonBaseAPI"]
# __all__ = ["Results", "Result", "Match", "Interval"]
class OdinsonBaseAPI:
def __init__(self, address: Text):
self.address = address
@staticmethod
def status_code_to_bool(code: int) -> bool:
        return code == requests.codes.ok
def __len__(self) -> int:
return self.numdocs
@property
def numdocs(self) -> int:
"""Total number of documents (num. docs = num. sentences) in the corpus."""
endpoint = f"{self.address}/api/numdocs"
return requests.get(endpoint).json()
@property
def tags_vocabulary(self) -> List[str]:
"""Retrieves vocabulary of part-of-speech tags for the current index."""
endpoint = f"{self.address}/api/tags-vocabulary"
return requests.get(endpoint).json()
@property
def edge_vocabulary(self) -> List[str]:
"""Retrieves vocabulary of dependencies for the current index."""
# FIXME: change this to edge-vocabulary
endpoint = f"{self.address}/api/dependencies-vocabulary"
return requests.get(endpoint).json()
def corpus(self) -> CorpusInfo:
"""Provides a summary of the current index"""
endpoint = f"{self.address}/api/corpus"
# return requests.get(endpoint).json()
return CorpusInfo(**requests.get(endpoint).json())
    # api/buildinfo
def buildinfo(self) -> Dict[str, Union[str, List[str], bool]]:
"""Provides detailed build information about the currently running app."""
endpoint = f"{self.address}/api/buildinfo"
return requests.get(endpoint).json()
# api/config
def _config(self) -> Dict[str, Any]:
"""Provides detailed build information about the currently running app."""
endpoint = f"{self.address}/api/config"
return requests.get(endpoint).json()
    def term_freq(self) -> List[Statistic]:
        # TODO: not yet implemented
        pass
def rule_freq(
self,
# An Odinson grammar.
grammar: str,
# Whether or not event arguments are permitted to overlap with the event's trigger. Defaults to false.
allow_trigger_overlaps: bool = False,
# The order in which to return results: "freq" (frequency order, default) or "alpha" (alphanumeric order).
order: Literal["freq", "alpha"] = "freq",
# The smallest rank to return, with 0 (default) being the highest ranked.
min: int = 0,
        # The highest rank to return.
max: int = 0,
# Scaling to apply to frequency counts. Choices are "count" (default), "log10", and "percent".
scale: Literal["count", "log10", "percent"] = "count",
        # Whether to reverse the rank order (e.g. to select the 10 least frequent results).
reverse: bool = False,
) -> List[Statistic]:
payload = {
"grammar": grammar,
"allowTriggerOverlaps": allow_trigger_overlaps,
"order": order,
"min": min,
"max": max,
"scale": scale,
"reverse": reverse,
"pretty": False,
}
endpoint = f"{self.address}/api/rule-freq"
return requests.post(endpoint, json=payload).json()
def _post_doc(
self, endpoint: str, doc: Document, headers: Optional[Dict[str, str]] = None
) -> requests.Response:
return requests.post(
endpoint,
json=doc.dict(),
# NOTE: data takes str & .json() returns json str
            # strange as it seems, this round trip seems necessary for at least some files
# data=json.dumps(json.loads(doc.json())),
headers=headers,
)
def _post_text(
self,
endpoint: str,
text: str,
params: Optional[Dict[str, Union[str,int]]] = None,
headers: Optional[Dict[str, str]] = None
) -> requests.Response:
return requests.post(
endpoint,
# NOTE: data takes str & .json() returns json str
# json=text,
data=text,
params=params,
headers=headers,
)
def validate_document(self, doc: Document, strict: bool = True) -> bool:
"""Inspects and validates an OdinsonDocument"""
endpoint = (
f"{self.address}/api/validate/document/strict"
if strict
else f"{self.address}/api/validate/document/relaxed"
)
res = self._post_doc(endpoint=endpoint, doc=doc)
return OdinsonBaseAPI.status_code_to_bool(res.status_code)
def validate_rule(
self, rule: str, verbose: bool = False
) -> Union[bool, OdinsonErrors]:
"""Inspects and validates an Odinson rule"""
endpoint = f"{self.address}/api/validate/rule"
res = self._post_text(endpoint=endpoint, text=rule)
if res.status_code == 200:
return OdinsonBaseAPI.status_code_to_bool(res.status_code)
else:
return False if not verbose else OdinsonErrors.model_validate(res.json())
def validate_grammar(
self, grammar: str, verbose: bool = False
) -> Union[bool, OdinsonErrors]:
"""Inspects and validates an Odinson grammar"""
endpoint = f"{self.address}/api/validate/grammar"
res = self._post_text(endpoint=endpoint, text=grammar)
if res.status_code == 200:
return OdinsonBaseAPI.status_code_to_bool(res.status_code)
else:
return False if not verbose else OdinsonErrors.model_validate(res.json())
def index(self, doc: Document, max_tokens: int = -1) -> bool:
"""Indexes a single Document"""
# endpoint = f"{self.address}/api/index/document"
endpoint = (
f"{self.address}/api/index/document/maxTokensPerSentence/{max_tokens}"
)
# NOTE: data takes str & .json() returns json str
headers = {"Content-type": "application/json", "Accept": "text/plain"}
res = self._post_doc(endpoint=endpoint, doc=doc, headers=headers)
return OdinsonBaseAPI.status_code_to_bool(res.status_code)
def update(self, doc: Document, max_tokens: Optional[int] = None) -> bool:
"""Updates an OdinsonDocument in the index, allowing for a specified maximum number of tokens per sentence."""
# f"{self.address}/api/update/document/{urllib.parse.quote(doc.id)}"
        endpoint = (
            f"{self.address}/api/update/document"
            if max_tokens is None
            else f"{self.address}/api/update/document/maxTokensPerSentence/{max_tokens}"
        )
res = self._post_doc(endpoint=endpoint, doc=doc)
return OdinsonBaseAPI.status_code_to_bool(res.status_code)
def delete(self, doc_or_id: Union[Document, Text]) -> bool:
"""Removes an OdinsonDocument from the index."""
doc_id: Text = doc_or_id if isinstance(doc_or_id, Text) else doc_or_id.id
endpoint = f"{self.address}/api/delete/document/{urllib.parse.quote(doc_id)}"
res = requests.delete(endpoint)
return OdinsonBaseAPI.status_code_to_bool(res.status_code)
def sentence(self, sentence_id: int) -> Sentence:
"""Retrieves an Odinson Sentence from the doc store."""
endpoint = f"{self.address}/api/sentence/{sentence_id}"
res = requests.get(endpoint)
return Sentence.model_validate(res.json())
def document(self, document_id: str) -> Document:
"""Retrieves an Odinson Document from the doc store."""
endpoint = f"{self.address}/api/document/{document_id}"
res = requests.get(endpoint)
return Document.model_validate(res.json())
    def metadata_for_sentence(self, sentence_id: int) -> List[AnyField]:
"""Retrieves Odinson Document Metadata from the doc store."""
endpoint = f"{self.address}/api/metadata/sentence/{sentence_id}"
res = requests.get(endpoint)
doc = Document.model_validate(
{"id": "UNK", "metadata": res.json(), "sentences": []}
)
return doc.metadata
def metadata_for_document(self, document_id: str) -> List[AnyField]:
"""Retrieves Odinson Document Metadata from the doc store."""
endpoint = f"{self.address}/api/metadata/document/{document_id}"
res = requests.get(endpoint)
# print(res.json())
doc = Document.model_validate(
{"id": document_id, "metadata": res.json(), "sentences": []}
)
return doc.metadata
def metadata(self, id: Union[str, int]) -> List[AnyField]:
"""Retrieves Odinson Document Metadata from the doc store."""
        if isinstance(id, str):
            return self.metadata_for_document(id)
        elif isinstance(id, int):
            return self.metadata_for_sentence(id)
        else:
            raise TypeError(f"Expected a document id (str) or sentence id (int), but got {type(id)}")
# TODO: /api/parent/sentence/:sentenceId
# TODO: /api/metadata/document/:odinsonDocId
# TODO: /api/metadata/sentence/:sentenceId
def _search(
self,
# An Odinson pattern.
# Example: [lemma=pie] []
odinson_query: str,
# A query to filter Documents by their metadata before applying an Odinson pattern.
metadata_query: Optional[str] = None,
# The label to use when committing mentions to the State.
# Example: character contains 'Special Agent'
label: Optional[str] = None,
# Whether or not the results of this query should be committed to the State.
commit: bool = False,
# The ID (sentenceId) for the last document (sentence) seen in the previous page of results.
prev_doc: Optional[int] = None,
# The score for the last result seen in the previous page of results.
prev_score: Optional[float] = None,
    ) -> Results:
endpoint = f"{self.address}/api/execute/pattern"
params = {
"odinsonQuery": odinson_query,
"metadataQuery": metadata_query,
"label": label,
"commit": commit,
"prevDoc": prev_doc,
"prevScore": prev_score,
}
        # drop parameters that were not provided (note: falsy values such as False and 0 are also dropped)
        params = {k: v for (k, v) in params.items() if v}
# print(params)
res = requests.get(endpoint, params=params)
# print(res)
return Results.empty() if res.status_code != 200 else Results(**res.json())
    def execute_grammar(
        self,
        # An Odinson grammar.
        grammar: str,
        # A query to filter Documents by their metadata before applying the grammar.
        metadata_query: Optional[str] = None,
        # The maximum number of documents to return.
        max_docs: Optional[int] = 20,
        # Whether or not event arguments are permitted to overlap with the event's trigger.
        allow_trigger_overlaps: bool = False,
    ) -> GrammarResults:
        """Executes an Odinson grammar against the current index."""
endpoint = f"{self.address}/api/execute/grammar"
params = {
"metadataQuery" : metadata_query,
"maxDocs" : max_docs,
"allowTriggerOverlaps" : allow_trigger_overlaps
}
res = self._post_text(endpoint=endpoint, text=grammar, params=params)
# return GrammarResults.empty() if res.status_code != 200 else GrammarResults(**res.json())
# FIXME: check status code and return error or empty results?
return GrammarResults(**res.json())
def search(
self,
# An Odinson pattern.
# Example: [lemma=pie] []
odinson_query: str,
# A query to filter Documents by their metadata before applying an Odinson pattern.
metadata_query: Optional[str] = None,
# The label to use when committing mentions to the State.
# Example: character contains 'Special Agent'
label: Optional[str] = None,
# Whether or not the results of this query should be committed to the State.
commit: bool = False,
# The ID (sentenceId) for the last document (sentence) seen in the previous page of results.
prev_doc: Optional[int] = None,
# The score for the last result seen in the previous page of results.
prev_score: Optional[float] = None,
) -> Iterator[ScoreDoc]:
endpoint = f"{self.address}/api/execute/pattern"
seen = 0
results: Results = self._search(
odinson_query=odinson_query,
metadata_query=metadata_query,
label=label,
commit=commit,
prev_doc=prev_doc,
)
total = results.total_hits
if total == 0:
return iter(())
last = results.score_docs[-1]
while seen < total:
for sd in results.score_docs:
seen += 1
last = sd
# print(f"{seen-1}/{total}")
# print(f"sd.document_id:\t{sd.document_id}")
# print(f"sd.sentence_id:\t{sd.sentence_id}\n")
# FIXME: should this be a Results() with a single doc?
yield sd
# paginate
results: Results = self._search(
odinson_query=odinson_query,
metadata_query=metadata_query,
label=label,
commit=commit,
prev_doc=last.sentence_id,
)
# print(f"total_hits:\t{results.total_hits}")
def search_disjunction_of_patterns(
self,
        # A list of Odinson patterns.
# Example: ["[lemma=pie] []", "[lemma=blarg]"]
patterns: list[str],
# A query to filter Documents by their metadata before applying an Odinson pattern.
metadata_query: Optional[str] = None,
# The label to use when committing mentions to the State.
# Example: character contains 'Special Agent'
label: Optional[str] = None,
# The ID (sentenceId) for the last document (sentence) seen in the previous page of results.
prev_doc: Optional[int] = None,
# The score for the last result seen in the previous page of results.
prev_score: Optional[float] = None,
) -> Iterator[ScoreDoc]:
endpoint = f"{self.address}/api/execute/disjunction-of-patterns"
spr = SimplePatternsRequest(
patterns=patterns,
metadataQuery=metadata_query,
prevDoc=prev_doc,
prevScore=prev_score,
)
results: Results = Results(**requests.post(endpoint, json=spr.dict()).json())
seen = 0
total = results.total_hits
if total == 0:
return iter(())
last = results.score_docs[-1]
while seen < total:
for sd in results.score_docs:
seen += 1
last = sd
# print(f"{seen-1}/{total}")
# print(f"sd.document_id:\t{sd.document_id}")
# print(f"sd.sentence_id:\t{sd.sentence_id}\n")
# FIXME: should this be a Results() with a single doc?
yield sd
# paginate
            nspr = SimplePatternsRequest(
                patterns=patterns,
                metadataQuery=metadata_query,
                prevDoc=last.sentence_id,
            )
results: Results = Results(
**requests.post(
endpoint,
json=nspr.dict(),
).json()
)
# print(f"total_hits:\t{results.total_hits}")
# TODO: add rewrite method
# for any token that matches the pattern, replace its entry in field <field> with <label>
# ex [word="Table" & tag=/NNP.*/] -> {scratch: "CAPTION"}
Classes
class OdinsonBaseAPI (address: Text)
-
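A minimal construction sketch; the server address is an assumption (point it at whatever host/port your Odinson REST server uses). The api client defined here is reused in the method sketches below.
from lum.odinson.rest.api import OdinsonBaseAPI
api = OdinsonBaseAPI(address="http://localhost:9000")  # hypothetical server address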
Static methods
def status_code_to_bool(code: int) ‑> bool
-
Instance variables
var edge_vocabulary : List[str]
-
Retrieves vocabulary of dependencies for the current index.
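Sketch (the returned values depend on the loaded index):
deps = api.edge_vocabulary  # list of dependency/edge labels known to the index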
var numdocs : int
-
Total number of documents (num. docs = num. sentences) in the corpus.
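Sketch:
n = api.numdocs  # same value as len(api)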
var tags_vocabulary : List[str]
-
Retrieves vocabulary of part-of-speech tags for the current index.
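Sketch:
tags = api.tags_vocabulary  # part-of-speech tags known to the index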
Methods
def buildinfo(self) ‑> Dict[str, Union[str, List[str], bool]]
-
Provides detailed build information about the currently running app.
def corpus(self) ‑> CorpusInfo
-
Provides a summary of the current index
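Sketch:
info = api.corpus()  # returns a CorpusInfo model summarizing the index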
def delete(self, doc_or_id: Union[Document, Text]) ‑> bool
-
Removes an OdinsonDocument from the index.
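Usage sketch; the document id is hypothetical, and doc is assumed to be a lum.odinson.doc.Document:
api.delete("doc-123")  # delete by id
api.delete(doc)        # or pass a Document; its .id is used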
def document(self, document_id: str) ‑> Document
-
Retrieves an Odinson Document from the doc store.
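Usage sketch; the document id is hypothetical:
doc = api.document("doc-123")  # returns a lum.odinson.doc.Document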
def execute_grammar(self, grammar: str, metadata_query: Optional[str] = None, max_docs: Optional[int] = 20, allow_trigger_overlaps: bool = False) ‑> GrammarResults
-
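Runs an Odinson grammar against the index and returns GrammarResults. A sketch with a hypothetical grammar in Odinson's YAML rule format (the rule name, label, and pattern are made up):
grammar = '''
rules:
  - name: example-rule
    label: Example
    type: basic
    pattern: |
      [lemma=pie] []
'''
results = api.execute_grammar(grammar, max_docs=10)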
def index(self, doc: Document, max_tokens: int = -1) ‑> bool
-
Indexes a single Document
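Usage sketch, where doc is assumed to be a lum.odinson.doc.Document:
ok = api.index(doc)                  # uses the default max_tokens (-1)
ok = api.index(doc, max_tokens=100)  # cap tokens per sentence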
def metadata(self, id: Union[str, int]) ‑> List[Union[lum.odinson.doc.TokensField, lum.odinson.doc.GraphField, lum.odinson.doc.StringField, lum.odinson.doc.DateField, lum.odinson.doc.NumberField, lum.odinson.doc.NestedField]]
-
Retrieves Odinson Document Metadata from the doc store.
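Usage sketch; the ids are hypothetical. A str is treated as a document id, an int as a sentence id:
fields = api.metadata("doc-123")  # document metadata
fields = api.metadata(42)         # sentence-level lookup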
def metadata_for_document(self, document_id: str) ‑> List[Union[lum.odinson.doc.TokensField, lum.odinson.doc.GraphField, lum.odinson.doc.StringField, lum.odinson.doc.DateField, lum.odinson.doc.NumberField, lum.odinson.doc.NestedField]]
-
Retrieves Odinson Document Metadata from the doc store.
def metadata_for_sentence(self, sentence_id: int) ‑> List[Union[lum.odinson.doc.TokensField, lum.odinson.doc.GraphField, lum.odinson.doc.StringField, lum.odinson.doc.DateField, lum.odinson.doc.NumberField, lum.odinson.doc.NestedField]]
-
Retrieves Odinson Document Metadata from the doc store.
def rule_freq(self, grammar: str, allow_trigger_overlaps: bool = False, order: "Literal['freq', 'alpha']" = 'freq', min: int = 0, max: int = 0, scale: "Literal['count', 'log10', 'percent']" = 'count', reverse: bool = False) ‑> List[Statistic]
-
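Reports per-rule frequency statistics for an Odinson grammar. Sketch, reusing the hypothetical grammar from the execute_grammar example:
stats = api.rule_freq(grammar, order="freq", min=0, max=9)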
def search(self, odinson_query: str, metadata_query: Optional[str] = None, label: Optional[str] = None, commit: bool = False, prev_doc: Optional[int] = None, prev_score: Optional[float] = None) ‑> Iterator[ScoreDoc]
-
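Lazily pages through matching ScoreDocs. Sketch; the pattern and metadata filter are hypothetical examples taken from the parameter comments:
for sd in api.search("[lemma=pie] []", metadata_query="character contains 'Special Agent'"):
    print(sd.document_id, sd.sentence_id)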
def search_disjunction_of_patterns(self, patterns: list[str], metadata_query: Optional[str] = None, label: Optional[str] = None, prev_doc: Optional[int] = None, prev_score: Optional[float] = None) ‑> Iterator[ScoreDoc]
-
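Like search, but takes several patterns and yields sentences matching any of them. Sketch, using the example patterns from the parameter comments:
patterns = ["[lemma=pie] []", "[lemma=blarg]"]
for sd in api.search_disjunction_of_patterns(patterns=patterns):
    print(sd.document_id, sd.sentence_id)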
def sentence(self, sentence_id: int) ‑> lum.odinson.doc.Sentence
-
Retrieves an Odinson Sentence from the doc store.
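Usage sketch; the sentence id is hypothetical:
s = api.sentence(42)  # returns a lum.odinson.doc.Sentence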
def term_freq(self) ‑> List[Statistic]
-
def update(self, doc: Document, max_tokens: Optional[int] = None) ‑> bool
-
Updates an OdinsonDocument in the index, allowing for a specified maximum number of tokens per sentence.
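Usage sketch, where doc is assumed to be a lum.odinson.doc.Document already known to the index:
ok = api.update(doc)
ok = api.update(doc, max_tokens=100)  # optionally cap tokens per sentence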
def validate_document(self, doc: Document, strict: bool = True) ‑> bool
-
Inspects and validates an OdinsonDocument
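Usage sketch, where doc is assumed to be a lum.odinson.doc.Document:
ok = api.validate_document(doc)                # strict validation (default)
ok = api.validate_document(doc, strict=False)  # relaxed validation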
def validate_grammar(self, grammar: str, verbose: bool = False) ‑> Union[bool, OdinsonErrors]
-
Inspects and validates an Odinson grammar
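Usage sketch, reusing the hypothetical grammar from the execute_grammar example:
ok = api.validate_grammar(grammar)                 # True or False
res = api.validate_grammar(grammar, verbose=True)  # True, or OdinsonErrors describing the failure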
def validate_rule(self, rule: str, verbose: bool = False) ‑> Union[bool, OdinsonErrors]
-
Inspects and validates an Odinson rule
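Usage sketch with a hypothetical rule:
ok = api.validate_rule("[lemma=pie] []")
res = api.validate_rule("[lemma=pie] []", verbose=True)  # True, or OdinsonErrors on failure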