from forte.data import DataPack
from forte.processors.base import PackProcessor
from ft.onto.base_ontology import Sentence
__all__ = [
"PeriodSentenceSegmenter"
]
class PeriodSentenceSegmenter(PackProcessor):
"""
A dummy sentence segmenter which segments sentences based only on periods.
Used for unit tests.
"""
def _process(self, input_pack: DataPack):
# pylint: disable=no-self-use
text = input_pack.text
begin_pos = 0
while begin_pos < len(text):
end_pos = text.find('.', begin_pos)
if end_pos == -1:
    end_pos = len(text) - 1
sentence_entry = Sentence(input_pack, begin_pos, end_pos + 1)
input_pack.add_or_get_entry(sentence_entry)
# Advance past the period so the loop makes progress and terminates.
begin_pos = end_pos + 1
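A minimal usage sketch for this processor (assuming forte's StringReader and a recent Pipeline API; the driver code below is illustrative, not part of the snippet above):

from forte.data.readers import StringReader
from forte.pipeline import Pipeline

pipeline = Pipeline[DataPack]()
pipeline.set_reader(StringReader())
pipeline.add(PeriodSentenceSegmenter())
pipeline.initialize()

pack = pipeline.process("This is one sentence. Here is another.")
for sentence in pack.get(Sentence):
    print(sentence.text)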
lemmas = [self.lemmatizer.lemmatize(token_texts[i], token_poses[i])
          for i in range(len(token_texts))]
for token, lemma in zip(token_entries, lemmas):
token.lemma = lemma
from typing import List

from nltk import RegexpParser, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import PunktSentenceTokenizer

from forte.common.configuration import Config
from forte.common.resources import Resources
from forte.data.data_pack import DataPack
from forte.processors.base import PackProcessor
from ft.onto.base_ontology import Phrase, Sentence, Token


def penn2morphy(penntag: str) -> str:
r"""Converts tags from Penn format to Morphy.
"""
morphy_tag = {'NN': 'n', 'JJ': 'a', 'VB': 'v', 'RB': 'r'}
if penntag[:2] in morphy_tag:
return morphy_tag[penntag[:2]]
else:
return 'n'
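# Example mappings produced by penn2morphy (derived directly from the table above):
#   penn2morphy('NNS') -> 'n'   # noun tags
#   penn2morphy('JJR') -> 'a'   # adjective tags
#   penn2morphy('VBZ') -> 'v'   # verb tags
#   penn2morphy('RBS') -> 'r'   # adverb tags
#   penn2morphy('IN')  -> 'n'   # anything unrecognised falls back to noun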
class NLTKChunker(PackProcessor):
r"""A wrapper of NLTK chunker.
"""
def __init__(self):
super().__init__()
self.chunker = None
# pylint: disable=unused-argument
def initialize(self, resources: Resources, configs: Config):
super().initialize(resources, configs)
self.chunker = RegexpParser(configs.pattern)
@classmethod
def default_configs(cls):
r"""This defines a basic config structure for NLTKChunker.
"""
from typing import Any, Dict
from forte.common.configuration import Config
from forte.common.resources import Resources
from forte.data.data_pack import DataPack
from forte.processors.base import PackProcessor
__all__ = [
"AttributeMasker"
]
class AttributeMasker(PackProcessor):
# pylint: disable=attribute-defined-outside-init
def initialize(self, _: Resources, config: Config):
self.fields = config.kwargs
@classmethod
def default_configs(cls) -> Dict[str, Any]:
r"""Default config for this processor.
Example usage is shown below
.. code-block:: python
{
"kwargs": {
Token: ["ner"]
    }
}
"""
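# The body of this classmethod is truncated in the snippet above; a
# plausible completion (an assumption -- no attributes are masked until the
# user supplies "kwargs") would be:
return {"kwargs": {}}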
"""
The main running pipeline for the rewriter.
"""
from examples.content_rewriter.reader import TableReader
from forte.data.data_pack import DataPack
from forte.pipeline import Pipeline
from forte.processors.base import PackProcessor
from forte.processors.writers import PackNameJsonPackWriter
from ft.onto.base_ontology import Utterance
class Instructor(PackProcessor):
def __init__(self, instruction: str):
super().__init__()
self.instruction = instruction
def _process(self, input_pack: DataPack):
input_pack.set_text(input_pack.text + '\n' + self.instruction)
u = Utterance(input_pack,
len(input_pack.text) - len(self.instruction),
len(input_pack.text))
u.speaker = 'ai'
instruct_text = 'This is an example to use the chatbot interface with the ' \
'content rewriter model. To run this example, follow the ' \
'instructions here "https://github.com/asyml/forte' \
'/tree/master/examples/content_rewriter" to obtain ' \
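The snippet above is truncated. A hypothetical way to wire these pieces into a pipeline (the writer config keys and the input path are assumptions, not the actual example script):

pipeline = Pipeline[DataPack]()
pipeline.set_reader(TableReader())
pipeline.add(Instructor(instruct_text))
# The content rewriter model itself would be added here in the full example.
pipeline.add(PackNameJsonPackWriter(), {'output_dir': '.', 'indent': 2})
pipeline.initialize()
pipeline.run('path/to/table_inputs')  # hypothetical input location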
"""
def __init__(self):
super().__init__()
self.token_component = None
def _process(self, input_pack: DataPack):
token_entries = list(input_pack.get(entry_type=Token,
components=self.token_component))
token_texts = [token.text for token in token_entries]
taggings = pos_tag(token_texts)
for token, tag in zip(token_entries, taggings):
token.pos = tag[1]
class NLTKLemmatizer(PackProcessor):
r"""A wrapper of NLTK lemmatizer.
"""
def __init__(self):
super().__init__()
self.token_component = None
self.lemmatizer = WordNetLemmatizer()
def _process(self, input_pack: DataPack):
token_entries: List[Token] = list(input_pack.get(
entry_type=Token, components=self.token_component))
token_texts: List[str] = []
token_poses: List[str] = []
for token in token_entries:
token_texts.append(token.text)
# Convert the Penn Treebank tag to a WordNet/Morphy POS tag for the lemmatizer.
token_poses.append(penn2morphy(token.pos))
if hasattr(chunk, 'label'):
# For example:
# chunk: Tree('NP', [('This', 'DT'), ('tool', 'NN')])
begin_pos = token_entries[index].span.begin
end_pos = token_entries[index + len(chunk) - 1].span.end
phrase = Phrase(input_pack, begin_pos, end_pos)
phrase.phrase_type = chunk.label()
index += len(chunk)
else:
# For example:
# chunk: ('is', 'VBZ')
index += 1
class NLTKSentenceSegmenter(PackProcessor):
r"""A wrapper of NLTK sentence tokenizer.
"""
def __init__(self):
super().__init__()
self.sent_splitter = PunktSentenceTokenizer()
def _process(self, input_pack: DataPack):
for begin, end in self.sent_splitter.span_tokenize(input_pack.text):
Sentence(input_pack, begin, end)
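# Note: span_tokenize yields (begin, end) character offsets rather than
# strings, which is why each span can be passed straight to the Sentence
# constructor. For example (standard NLTK behaviour):
#   list(PunktSentenceTokenizer().span_tokenize("Hello there. How are you?"))
#   == [(0, 12), (13, 25)]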
class NLTKNER(PackProcessor):
r"""A wrapper of NLTK NER.
"""
from nltk.tokenize import word_tokenize
from forte.data import DataPack
from forte.data.ontology import base_ontology
from forte.processors.base import PackProcessor, ProcessInfo
__all__ = [
"NLTKWordTokenizer",
]
class NLTKWordTokenizer(PackProcessor):
"""
A wrapper of NLTK word tokenizer.
"""
def __init__(self):
super().__init__()
self.sentence_component = None
self._ontology = base_ontology
def _define_input_info(self) -> ProcessInfo:
input_info: ProcessInfo = {
self._ontology.Sentence: ["span"]
}
return input_info
def _define_output_info(self) -> ProcessInfo:
output_info: ProcessInfo = {
    self._ontology.Token: ["span"]
}
return output_info
from forte.data.data_pack import DataPack
from forte.processors.base import PackProcessor
__all__ = [
"LowerCaserProcessor",
]
class LowerCaserProcessor(PackProcessor):
def _process(self, input_pack: DataPack):
input_pack.set_text(input_pack.text.lower())
from typing import Optional

import spacy
from spacy.language import Language
from spacy.cli.download import download
from forte.common import ProcessExecutionException
from forte.common.configuration import Config
from forte.common.resources import Resources
from forte.data.data_pack import DataPack
from forte.processors.base import PackProcessor
from ft.onto.base_ontology import EntityMention, Sentence, Token
__all__ = [
"SpacyProcessor",
]
class SpacyProcessor(PackProcessor):
"""
A wrapper for spaCy processors
"""
def __init__(self):
super().__init__()
self.processors: str = ""
self.nlp: Optional[Language] = None
self.lang_model: str = ''
def set_up(self):
try:
self.nlp = spacy.load(self.lang_model)
except OSError:
download(self.lang_model)
self.nlp = spacy.load(self.lang_model)
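The configuration that drives set_up is not shown in this snippet. A hypothetical config of the expected shape (the key names are assumptions based on the attributes above, not the processor's actual defaults):

spacy_config = {
    'processors': 'tokenize, pos, ner',  # which spaCy outputs to convert into forte entries
    'lang': 'en_core_web_sm',            # model name handed to spacy.load()
}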
import texar.torch as tx
import torch
from forte.common.configuration import Config
from forte.common.resources import Resources
from forte.data.data_pack import DataPack
from forte.data.ontology.top import Annotation
from forte.processors.base import PackProcessor
from forte.utils.utils import get_class
__all__ = [
"PretrainedEncoder",
]
class PretrainedEncoder(PackProcessor):
r"""A wrapper of Texar pre-trained encoders.
This processor will compute the embedding vectors for entries of type
``Annotation`` using pre-trained models. The user can specify the
pre-trained model type and the annotation class name via configuration.
For the full list of pre-trained models supported, see
:meth:`default_configs` for more details. The processor adds an embedding
vector to every entry matching the specified entry type; the resulting
vector can be accessed through the ``embedding`` field of the annotation.
"""
def __init__(self):
super().__init__()
self.tokenizer = None
self.encoder = None
self.entry_type = None
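Based on the description above, a hypothetical configuration for this processor might look like the following (the exact keys and the list of supported model names live in the processor's default configs, which are not shown here):

encoder_config = {
    # Texar pre-trained model to load, e.g. a BERT variant.
    'pretrained_model_name': 'bert-base-uncased',
    # Fully qualified name of the Annotation subclass whose text gets embedded.
    'entry_type': 'ft.onto.base_ontology.Sentence',
}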