Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
"""
import re
from nltk.tokenize import RegexpTokenizer
from nltk.tag import simplify_brown_tag, simplify_wsj_tag,\
simplify_alpino_tag, simplify_indian_tag,\
simplify_tag
from .util import LazyCorpusLoader
from .reader import *
# Corpus loaders. Each name is bound to a LazyCorpusLoader built from the
# corpus directory name, the reader class, and that reader's arguments.
abc = LazyCorpusLoader(
    'abc', PlaintextCorpusReader, r'(?!\.).*\.txt',
    encoding=[('science', 'latin_1'), ('rural', 'utf8')])
alpino = LazyCorpusLoader(
    'alpino', AlpinoCorpusReader,
    tag_mapping_function=simplify_alpino_tag)
brown = LazyCorpusLoader(
    'brown', CategorizedTaggedCorpusReader, r'c[a-z]\d\d',
    cat_file='cats.txt',
    tag_mapping_function=simplify_brown_tag,
    encoding="ascii")
# The two cess_* corpora share the same reader, file pattern and encoding.
cess_cat = LazyCorpusLoader(
    'cess_cat', BracketParseCorpusReader, r'(?!\.).*\.tbf',
    tag_mapping_function=simplify_tag,
    encoding='ISO-8859-2')
cess_esp = LazyCorpusLoader(
    'cess_esp', BracketParseCorpusReader, r'(?!\.).*\.tbf',
    tag_mapping_function=simplify_tag,
    encoding='ISO-8859-2')
cmudict = LazyCorpusLoader('cmudict', CMUDictCorpusReader, ['cmudict'])
comtrans = LazyCorpusLoader(
    'comtrans', AlignedCorpusReader, r'(?!\.).*\.txt')
conll2000 = LazyCorpusLoader(
# Corpus loaders: (corpus directory, reader class, reader arguments...).
ieer = LazyCorpusLoader('ieer', IEERCorpusReader, r'(?!README|\.).*')
inaugural = LazyCorpusLoader(
    'inaugural', PlaintextCorpusReader, r'(?!\.).*\.txt',
    encoding='latin1')
# [XX] This should probably just use TaggedCorpusReader:
indian = LazyCorpusLoader(
    'indian', IndianCorpusReader, r'(?!\.).*\.pos',
    tag_mapping_function=simplify_indian_tag, encoding='utf8')
ipipan = LazyCorpusLoader(
    'ipipan', IPIPANCorpusReader, r'(?!\.).*morph\.xml')
jeita = LazyCorpusLoader(
    'jeita', ChasenCorpusReader, r'.*\.chasen',
    encoding='utf-8')
# Note the corpus root here is a subdirectory, 'knbc/corpus1'.
knbc = LazyCorpusLoader(
    'knbc/corpus1', KNBCorpusReader, r'.*/KN.*',
    encoding='euc-jp')
lin_thesaurus = LazyCorpusLoader(
    'lin_thesaurus', LinThesaurusCorpusReader, r'.*\.lsp')
# Corpus loaders: (corpus directory, reader class, reader arguments...).
mac_morpho = LazyCorpusLoader(
    'mac_morpho', MacMorphoCorpusReader, r'(?!\.).*\.txt',
    tag_mapping_function=simplify_tag,
    encoding='latin-1')
# Categorized readers below are given a cat_pattern regex whose first
# group captures the leading path component of each file.
machado = LazyCorpusLoader(
    'machado', PortugueseCategorizedPlaintextCorpusReader, r'(?!\.).*\.txt',
    cat_pattern=r'([a-z]*)/.*',
    encoding='latin-1')
movie_reviews = LazyCorpusLoader(
    'movie_reviews', CategorizedPlaintextCorpusReader, r'(?!\.).*\.txt',
    cat_pattern=r'(neg|pos)/.*',
    encoding='ascii')
names = LazyCorpusLoader(
    'names', WordListCorpusReader, r'(?!\.).*\.txt',
    encoding='ascii')
nps_chat = LazyCorpusLoader(
    'nps_chat', NPSChatCorpusReader, r'(?!README|\.).*\.xml',
    tag_mapping_function=simplify_wsj_tag)
tag_mapping_function=simplify_tag, encoding='ISO-8859-2')
# Corpus loaders: (corpus directory, reader class, reader arguments...).
cess_esp = LazyCorpusLoader(
    'cess_esp', BracketParseCorpusReader, r'(?!\.).*\.tbf',
    tag_mapping_function=simplify_tag,
    encoding='ISO-8859-2')
cmudict = LazyCorpusLoader('cmudict', CMUDictCorpusReader, ['cmudict'])
comtrans = LazyCorpusLoader(
    'comtrans', AlignedCorpusReader, r'(?!\.).*\.txt')
# conll2000 lists its two files explicitly instead of using a pattern.
conll2000 = LazyCorpusLoader(
    'conll2000', ConllChunkCorpusReader, ['train.txt', 'test.txt'],
    ('NP','VP','PP'),
    tag_mapping_function=simplify_wsj_tag,
    encoding='ascii')
# Corpus loaders: (corpus directory, reader class, reader arguments...).
# The file-id patterns are regular expressions, so they are written as raw
# strings: in a plain literal, '\.' is an invalid string escape that Python
# warns about. The resulting string values are unchanged.
conll2002 = LazyCorpusLoader(
    'conll2002', ConllChunkCorpusReader, r'.*\.(test|train).*',
    ('LOC', 'PER', 'ORG', 'MISC'), encoding='utf-8')
conll2007 = LazyCorpusLoader(
    'conll2007', DependencyCorpusReader, r'.*\.(test|train).*',
    encoding=[('eus', 'ISO-8859-2'), ('esp', 'utf8')])
dependency_treebank = LazyCorpusLoader(
    'dependency_treebank', DependencyCorpusReader, r'.*\.dp',
    encoding='ascii')
# Corpus loaders: (corpus directory, reader class, reader arguments...).
floresta = LazyCorpusLoader(
    'floresta', BracketParseCorpusReader, r'(?!\.).*\.ptb', '#',
    tag_mapping_function=simplify_tag,
    encoding='ISO-8859-15')
# framenet names its index files explicitly.
framenet = LazyCorpusLoader(
    'framenet_v15', FramenetCorpusReader,
    ['frRelation.xml','frameIndex.xml','fulltextIndex.xml',
     'luIndex.xml','semTypes.xml'])
gazetteers = LazyCorpusLoader(
    'gazetteers', WordListCorpusReader, r'(?!LICENSE|\.).*\.txt',
    encoding='ISO-8859-2')
genesis = LazyCorpusLoader(
'genesis', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding=[
# Corpus loaders: (corpus directory, reader class, reader arguments...).
cmudict = LazyCorpusLoader('cmudict', CMUDictCorpusReader, ['cmudict'])
comtrans = LazyCorpusLoader(
    'comtrans', AlignedCorpusReader, r'(?!\.).*\.txt')
# conll2000 lists its two files explicitly instead of using a pattern.
conll2000 = LazyCorpusLoader(
    'conll2000', ConllChunkCorpusReader, ['train.txt', 'test.txt'],
    ('NP','VP','PP'),
    tag_mapping_function=simplify_wsj_tag,
    encoding='ascii')
# Corpus loaders: (corpus directory, reader class, reader arguments...).
# The file-id patterns are regular expressions, so they are written as raw
# strings: in a plain literal, '\.' is an invalid string escape that Python
# warns about. The resulting string values are unchanged.
conll2002 = LazyCorpusLoader(
    'conll2002', ConllChunkCorpusReader, r'.*\.(test|train).*',
    ('LOC', 'PER', 'ORG', 'MISC'), encoding='utf-8')
conll2007 = LazyCorpusLoader(
    'conll2007', DependencyCorpusReader, r'.*\.(test|train).*',
    encoding=[('eus', 'ISO-8859-2'), ('esp', 'utf8')])
dependency_treebank = LazyCorpusLoader(
    'dependency_treebank', DependencyCorpusReader, r'.*\.dp',
    encoding='ascii')
# Corpus loaders: (corpus directory, reader class, reader arguments...).
floresta = LazyCorpusLoader(
    'floresta', BracketParseCorpusReader, r'(?!\.).*\.ptb', '#',
    tag_mapping_function=simplify_tag,
    encoding='ISO-8859-15')
# framenet names its index files explicitly.
framenet = LazyCorpusLoader(
    'framenet_v15', FramenetCorpusReader,
    ['frRelation.xml','frameIndex.xml','fulltextIndex.xml',
     'luIndex.xml','semTypes.xml'])
gazetteers = LazyCorpusLoader(
    'gazetteers', WordListCorpusReader, r'(?!LICENSE|\.).*\.txt',
    encoding='ISO-8859-2')
# genesis passes a list of (regex, encoding) pairs; the final '.*' entry
# matches anything (presumably acting as the fallback; confirm in reader).
genesis = LazyCorpusLoader(
    'genesis', PlaintextCorpusReader, r'(?!\.).*\.txt',
    encoding=[
        ('finnish|french|german', 'latin_1'),
        ('swedish', 'cp865'),
        ('.*', 'utf_8')])
gutenberg = LazyCorpusLoader(
tag_mapping_function=simplify_wsj_tag, encoding='ascii')
# Corpus loaders: (corpus directory, reader class, reader arguments...).
# treebank_chunk supplies its own sentence tokenizer (a gap-matching
# RegexpTokenizer) and paragraph block reader.
treebank_chunk = LazyCorpusLoader(
    'treebank/tagged', ChunkedCorpusReader, r'wsj_.*\.pos',
    sent_tokenizer=RegexpTokenizer(
        r'(?<=/\.)\s*(?![^\[]*\])', gaps=True),
    para_block_reader=tagged_treebank_para_block_reader,
    encoding='ascii')
treebank_raw = LazyCorpusLoader(
    'treebank/raw', PlaintextCorpusReader, r'wsj_.*',
    encoding='ISO-8859-2')
udhr = LazyCorpusLoader('udhr', UdhrCorpusReader)
verbnet = LazyCorpusLoader(
    'verbnet', VerbnetCorpusReader, r'(?!\.).*\.xml')
webtext = LazyCorpusLoader(
    'webtext', PlaintextCorpusReader, r'(?!README|\.).*\.txt',
    encoding='ISO-8859-2')
wordnet = LazyCorpusLoader('wordnet', WordNetCorpusReader)
# Corpus loaders: (corpus directory, reader class, reader arguments...).
# Regex arguments are written as raw strings: in a plain literal, '\.' is an
# invalid string escape that Python warns about. The string values are
# unchanged.
wordnet_ic = LazyCorpusLoader(
    'wordnet_ic', WordNetICCorpusReader, r'.*\.dat')
words = LazyCorpusLoader(
    'words', WordListCorpusReader, r'(?!README|\.).*', encoding='ascii')
ycoe = LazyCorpusLoader(
    'ycoe', YCOECorpusReader)
# defined after treebank
# Both loaders below receive the `treebank` loader object as their last
# argument, so that name must already be bound when these statements run.
# The lambda strips a leading 'wsj/NN/' (two digits) prefix from a filename.
propbank = LazyCorpusLoader(
    'propbank', PropbankCorpusReader,
    'prop.txt', r'frames/.*\.xml', 'verbs.txt',
    lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
    treebank)  # Must be defined *after* treebank corpus.
nombank = LazyCorpusLoader(
    'nombank.1.0', NombankCorpusReader,
    'nombank.1.0', r'frames/.*\.xml', 'nombank.1.0.words',
    lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
    treebank)  # Must be defined *after* treebank corpus.
# Corpus loaders: (corpus directory, reader class, reader arguments...).
state_union = LazyCorpusLoader(
    'state_union', PlaintextCorpusReader, r'(?!\.).*\.txt',
    encoding='ISO-8859-2')
stopwords = LazyCorpusLoader(
    'stopwords', WordListCorpusReader, r'(?!README|\.).*',
    encoding='utf8')
swadesh = LazyCorpusLoader(
    'swadesh', SwadeshCorpusReader, r'(?!README|\.).*',
    encoding='utf8')
# No file pattern is given for the next two loaders.
switchboard = LazyCorpusLoader(
    'switchboard', SwitchboardCorpusReader,
    tag_mapping_function=simplify_wsj_tag)
timit = LazyCorpusLoader('timit', TimitCorpusReader)
# Loader over the 'timit' corpus directory, restricted to '*.tags' files.
# The pattern is a regex, so it is a raw string: in a plain literal, '\.' is
# an invalid string escape that Python warns about. The value is unchanged.
timit_tagged = LazyCorpusLoader(
    'timit', TimitTaggedCorpusReader, r'.+\.tags',
    tag_mapping_function=simplify_wsj_tag, encoding='ascii')
# Corpus loaders: (corpus directory, reader class, reader arguments...).
toolbox = LazyCorpusLoader(
    'toolbox', ToolboxCorpusReader,
    r'(?!.*(README|\.)).*\.(dic|txt)')
# Three loaders over sub-directories of 'treebank': combined, tagged, raw.
treebank = LazyCorpusLoader(
    'treebank/combined', BracketParseCorpusReader, r'wsj_.*\.mrg',
    tag_mapping_function=simplify_wsj_tag,
    encoding='ascii')
treebank_chunk = LazyCorpusLoader(
    'treebank/tagged', ChunkedCorpusReader, r'wsj_.*\.pos',
    sent_tokenizer=RegexpTokenizer(
        r'(?<=/\.)\s*(?![^\[]*\])', gaps=True),
    para_block_reader=tagged_treebank_para_block_reader,
    encoding='ascii')
treebank_raw = LazyCorpusLoader(
    'treebank/raw', PlaintextCorpusReader, r'wsj_.*',
    encoding='ISO-8859-2')
udhr = LazyCorpusLoader('udhr', UdhrCorpusReader)
verbnet = LazyCorpusLoader(
    'verbnet', VerbnetCorpusReader, r'(?!\.).*\.xml')
webtext = LazyCorpusLoader(
    'webtext', PlaintextCorpusReader, r'(?!README|\.).*\.txt',
    encoding='ISO-8859-2')
para_block_reader=tagged_treebank_para_block_reader, encoding='ascii')
# Corpus loaders: (corpus directory, reader class, reader arguments...).
treebank_raw = LazyCorpusLoader(
    'treebank/raw', PlaintextCorpusReader, r'wsj_.*',
    encoding='ISO-8859-2')
udhr = LazyCorpusLoader('udhr', UdhrCorpusReader)
verbnet = LazyCorpusLoader(
    'verbnet', VerbnetCorpusReader, r'(?!\.).*\.xml')
webtext = LazyCorpusLoader(
    'webtext', PlaintextCorpusReader, r'(?!README|\.).*\.txt',
    encoding='ISO-8859-2')
wordnet = LazyCorpusLoader('wordnet', WordNetCorpusReader)
# Corpus loaders: (corpus directory, reader class, reader arguments...).
# Regex arguments are written as raw strings: in a plain literal, '\.' is an
# invalid string escape that Python warns about. The string values are
# unchanged.
wordnet_ic = LazyCorpusLoader(
    'wordnet_ic', WordNetICCorpusReader, r'.*\.dat')
words = LazyCorpusLoader(
    'words', WordListCorpusReader, r'(?!README|\.).*', encoding='ascii')
ycoe = LazyCorpusLoader(
    'ycoe', YCOECorpusReader)
# defined after treebank
# Both loaders below receive the `treebank` loader object as their last
# argument, so that name must already be bound when these statements run.
# The lambda strips a leading 'wsj/NN/' (two digits) prefix from a filename.
propbank = LazyCorpusLoader(
    'propbank', PropbankCorpusReader,
    'prop.txt', r'frames/.*\.xml', 'verbs.txt',
    lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
    treebank)  # Must be defined *after* treebank corpus.
nombank = LazyCorpusLoader(
    'nombank.1.0', NombankCorpusReader,
    'nombank.1.0', r'frames/.*\.xml', 'nombank.1.0.words',
    lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
    treebank)  # Must be defined *after* treebank corpus.
propbank_ptb = LazyCorpusLoader(
'propbank', PropbankCorpusReader,
'prop.txt', 'frames/.*\.xml', 'verbs.txt',
lambda filename: filename.upper(),
# Corpus loaders: (corpus directory, reader class, reader arguments...).
shakespeare = LazyCorpusLoader(
    'shakespeare', XMLCorpusReader, r'(?!\.).*\.xml')
# sinica_treebank names its single file id explicitly.
sinica_treebank = LazyCorpusLoader(
    'sinica_treebank', SinicaTreebankCorpusReader, ['parsed'],
    tag_mapping_function=simplify_tag,
    encoding='utf-8')
state_union = LazyCorpusLoader(
    'state_union', PlaintextCorpusReader, r'(?!\.).*\.txt',
    encoding='ISO-8859-2')
stopwords = LazyCorpusLoader(
    'stopwords', WordListCorpusReader, r'(?!README|\.).*',
    encoding='utf8')
swadesh = LazyCorpusLoader(
    'swadesh', SwadeshCorpusReader, r'(?!README|\.).*',
    encoding='utf8')
# No file pattern is given for the next two loaders.
switchboard = LazyCorpusLoader(
    'switchboard', SwitchboardCorpusReader,
    tag_mapping_function=simplify_wsj_tag)
timit = LazyCorpusLoader('timit', TimitCorpusReader)
# Loader over the 'timit' corpus directory, restricted to '*.tags' files.
# The pattern is a regex, so it is a raw string: in a plain literal, '\.' is
# an invalid string escape that Python warns about. The value is unchanged.
timit_tagged = LazyCorpusLoader(
    'timit', TimitTaggedCorpusReader, r'.+\.tags',
    tag_mapping_function=simplify_wsj_tag, encoding='ascii')
# Corpus loaders: (corpus directory, reader class, reader arguments...).
toolbox = LazyCorpusLoader(
    'toolbox', ToolboxCorpusReader,
    r'(?!.*(README|\.)).*\.(dic|txt)')
# Three loaders over sub-directories of 'treebank': combined, tagged, raw.
treebank = LazyCorpusLoader(
    'treebank/combined', BracketParseCorpusReader, r'wsj_.*\.mrg',
    tag_mapping_function=simplify_wsj_tag,
    encoding='ascii')
treebank_chunk = LazyCorpusLoader(
    'treebank/tagged', ChunkedCorpusReader, r'wsj_.*\.pos',
    sent_tokenizer=RegexpTokenizer(
        r'(?<=/\.)\s*(?![^\[]*\])', gaps=True),
    para_block_reader=tagged_treebank_para_block_reader,
    encoding='ascii')
treebank_raw = LazyCorpusLoader(
    'treebank/raw', PlaintextCorpusReader, r'wsj_.*',
    encoding='ISO-8859-2')
udhr = LazyCorpusLoader(
>>> from nltk.corpus import brown
>>> print(", ".join(brown.words()))
The, Fulton, County, Grand, Jury, said, ...
"""
import re
from nltk.tokenize import RegexpTokenizer
from nltk.tag import simplify_brown_tag, simplify_wsj_tag,\
simplify_alpino_tag, simplify_indian_tag,\
simplify_tag
from .util import LazyCorpusLoader
from .reader import *
# Corpus loaders. Each name is bound to a LazyCorpusLoader built from the
# corpus directory name, the reader class, and that reader's arguments.
abc = LazyCorpusLoader(
    'abc', PlaintextCorpusReader, r'(?!\.).*\.txt',
    encoding=[('science', 'latin_1'), ('rural', 'utf8')])
alpino = LazyCorpusLoader(
    'alpino', AlpinoCorpusReader,
    tag_mapping_function=simplify_alpino_tag)
brown = LazyCorpusLoader(
    'brown', CategorizedTaggedCorpusReader, r'c[a-z]\d\d',
    cat_file='cats.txt',
    tag_mapping_function=simplify_brown_tag,
    encoding="ascii")
# The two cess_* corpora share the same reader, file pattern and encoding.
cess_cat = LazyCorpusLoader(
    'cess_cat', BracketParseCorpusReader, r'(?!\.).*\.tbf',
    tag_mapping_function=simplify_tag,
    encoding='ISO-8859-2')
cess_esp = LazyCorpusLoader(
    'cess_esp', BracketParseCorpusReader, r'(?!\.).*\.tbf',
    tag_mapping_function=simplify_tag,
    encoding='ISO-8859-2')
cmudict = LazyCorpusLoader(
# Corpus loaders: (corpus directory, reader class, reader arguments...).
udhr = LazyCorpusLoader('udhr', UdhrCorpusReader)
verbnet = LazyCorpusLoader(
    'verbnet', VerbnetCorpusReader, r'(?!\.).*\.xml')
webtext = LazyCorpusLoader(
    'webtext', PlaintextCorpusReader, r'(?!README|\.).*\.txt',
    encoding='ISO-8859-2')
wordnet = LazyCorpusLoader('wordnet', WordNetCorpusReader)
# Corpus loaders: (corpus directory, reader class, reader arguments...).
# Regex arguments are written as raw strings: in a plain literal, '\.' is an
# invalid string escape that Python warns about. The string values are
# unchanged.
wordnet_ic = LazyCorpusLoader(
    'wordnet_ic', WordNetICCorpusReader, r'.*\.dat')
words = LazyCorpusLoader(
    'words', WordListCorpusReader, r'(?!README|\.).*', encoding='ascii')
ycoe = LazyCorpusLoader(
    'ycoe', YCOECorpusReader)
# defined after treebank
# The loaders below receive another loader object (`treebank` or `ptb`) as
# their last argument, so that name must already be bound when they run.
# For the treebank-backed loaders, the lambda strips a leading 'wsj/NN/'
# (two digits) prefix from a filename.
propbank = LazyCorpusLoader(
    'propbank', PropbankCorpusReader,
    'prop.txt', r'frames/.*\.xml', 'verbs.txt',
    lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
    treebank)  # Must be defined *after* treebank corpus.
nombank = LazyCorpusLoader(
    'nombank.1.0', NombankCorpusReader,
    'nombank.1.0', r'frames/.*\.xml', 'nombank.1.0.words',
    lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
    treebank)  # Must be defined *after* treebank corpus.
# Same propbank files, but filenames are upper-cased and the `ptb` loader is
# passed instead of `treebank`.
propbank_ptb = LazyCorpusLoader(
    'propbank', PropbankCorpusReader,
    'prop.txt', r'frames/.*\.xml', 'verbs.txt',
    lambda filename: filename.upper(),
    ptb)  # Must be defined *after* ptb corpus.
nombank_ptb = LazyCorpusLoader(
'nombank.1.0', NombankCorpusReader,