# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
#! /usr/bin/python
import sys
sys.path.append('lib');
import re
import string
import datetime
import getopt
import os
import pyarabic.araby as araby
import qalsadi.analex
scriptname = os.path.splitext(os.path.basename(sys.argv[0]))[0]
scriptversion = '0.1'
AuthorName="Taha Zerrouki"
#Token_pattern=re.compile(u"([\w%s ]+)"%(u"".join(araby.TASHKEEL),),re.UNICODE);
Clause_pattern=re.compile(u"([\w%s\s]+)"%(u"".join(araby.TASHKEEL),),re.UNICODE);
#Token_pattern=re.compile(u"([^\w]+)",re.UNICODE);
# Token_pattern=re.compile(u"([^\w%s\s])+"%(u"".join(araby.TASHKEEL),),re.UNICODE);
def phraseSplit(text):
    """
    Split Text into clauses (maximal runs of word characters and tashkeel).
    @param text: input text;
    @type text: unicode;
    @return: list of clauses
    @rtype: list of unicode
    """
    if not text:
        # empty/None input: nothing to split
        return []
    # Clause_pattern has a capturing group, so re.split() returns both the
    # clauses (group matches) and the inter-clause separators (punctuation
    # runs).  The original computed the split but never returned anything;
    # keep only the non-empty pieces that are actual clauses.
    return [piece for piece in Clause_pattern.split(text)
            if piece and Clause_pattern.match(piece)]
def separate_token_with_dicrites(token):
    """Split *token* into per-letter strings, each letter keeping its tashkeel.

    Args:
        token (str): string representing a word or aya or sura.
    Returns:
        [str]: one entry per base letter, carrying its trailing diacritics.
    """
    # Strip tatweel first; the original computed this but kept iterating the
    # raw token (and printed it as debug leftover).
    token = araby.strip_tatweel(token)
    letter_groups = (alphabet, alefat, hamzat)
    diacritic_groups = (tashkeel, harakat, shortharakat, tanwin)
    hroof_with_tashkeel = []
    i = 0
    n = len(token)
    while i < n:
        ch = token[i]
        # NOTE: the original tested `ch in (alphabet or alefat or hamzat)`,
        # which only ever checked the first truthy collection; test each
        # collection explicitly.
        if any(ch in group for group in letter_groups):
            j = i
            harf_with_taskeel = ch
            # absorb every diacritic that follows this letter
            while j + 1 < n and any(token[j + 1] in g for g in diacritic_groups):
                harf_with_taskeel += token[j + 1]
                j += 1
            hroof_with_tashkeel.append(harf_with_taskeel)
            i = j
        i += 1
    return hroof_with_tashkeel
def main(args):
    """Print the unvocalized affix lists for nouns and verbs.

    @param args: command-line arguments (unused here);
    @return: 0 on success.
    """
    gen = alyahmor.genelex.genelex()
    # (header line, word type, whether to also dump the '],'-split layout)
    jobs = (
        ('NOUN_AFFIX_LIST=', "noun", False),
        ('VERB_AFFIX_LIST=', "verb", True),
    )
    for header, word_type, dump_bracketed in jobs:
        print(header)
        affixes = gen.generate_affix_list(word_type=word_type, vocalized=False)
        if dump_bracketed:
            print(arepr(affixes).replace('],', '],\n'))
        print(arepr(affixes).replace(',', ',\n'))
    return 0
if __name__ == '__main__':
    # This span held a byte-for-byte duplicate redefinition of main()
    # (already defined earlier in the file) trapped under an empty guard;
    # replace the dead code with a real entry point.
    sys.exit(main(sys.argv[1:]))
if __name__ == '__main__':
    # This span held another byte-for-byte duplicate redefinition of main()
    # (already defined earlier in the file) trapped under an empty guard;
    # replace the dead code with a real entry point.  (sys.exit makes any
    # later guard unreachable, so stacked guards are harmless.)
    sys.exit(main(sys.argv[1:]))
# ---------------------------------------------------------------------------
# NOTE(review): orphaned fragment.  The lines below read like the interior of
# a word-form dumping routine from another file: the `if __name__` guard has
# no indented body, and `generator`, `list_forms`, `wtype`, `word` and
# `arepr` are all defined outside the visible file.  Indentation appears to
# have been stripped when the file was assembled; left byte-identical pending
# recovery of the original source.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
#~ print(u"\n".join((unv_forms)).encode('utf8'))
# Dump the vocalized forms and their dict layout for the current word.
voc_forms = generator.get_vocalized_forms(list_forms)
#~ print(u"\n".join((voc_forms)).encode('utf8'))
voc_forms_dict = generator.get_vocalized_forms_dict(list_forms)
print(arepr(voc_forms_dict).replace('],', '],\n'))
# Verb branch: regenerate the form list for the verb reading of the word.
if wtype == "verb":
print('************verb*****')
list_forms =generator.generate_verb_forms(word)
#~ print(arepr(verb_forms).replace('),', '),\n').replace('],', '],\n'))
unv_forms = generator.get_unvocalized_forms(list_forms)
#~ print(u"\n".join((unv_forms)).encode('utf8'))
voc_forms = generator.get_vocalized_forms(list_forms)
#~ print(u"\n".join((voc_forms)).encode('utf8'))
# Only the first 10 forms are expanded into the dict here — presumably to
# keep the dump short; TODO confirm against the original script.
voc_forms_dict = generator.get_vocalized_forms_dict(list_forms[:10])
print(arepr(voc_forms_dict).replace('],', '],\n'))
# ---------------------------------------------------------------------------
# NOTE(review): orphaned fragment — interior of a noun-conjugation generator
# method.  `self`, `noun_tuple`, `snconst`, `nspell` and `VERIFY_INPUT` are
# all defined outside the visible file, and indentation has been stripped;
# left byte-identical pending recovery of the original source.
# ---------------------------------------------------------------------------
# Bail out when the dictionary entry is missing or has no vocalized form.
if not noun_tuple or not noun_tuple.get('vocalized',''):
return ""
nb = 0
prefix_table =[]
suffix_table =[]
stem_table = []
flags_table ={}
# Try every (procletic, encletic, conjugation-suffix) combination known to
# the instance and keep the ones the noun entry accepts.
for procletic, encletic, suffix in self.affixes_list:
affix_tags = snconst.COMP_PREFIX_LIST_TAGS[procletic]['tags'] \
+snconst.COMP_SUFFIX_LIST_TAGS[encletic]['tags'] \
+snconst.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']
#test if the given word from dictionary accept those
# tags given by affixes
# (translated from Arabic) check that the affixes are compatible with the
# noun's features, e.g. whether the noun accepts the feminine marker.
suffix_nm = araby.strip_tashkeel(suffix)
encletic_nm = araby.strip_tashkeel(encletic)
if nspell.validate_tags(noun_tuple, affix_tags, procletic, encletic_nm, suffix_nm):
if nspell.is_compatible_proaffix_affix(noun_tuple, procletic, encletic, suffix):
vocalized, semi_vocalized, segmented = nspell.vocalize(noun_tuple['vocalized'], procletic, suffix, encletic)
if VERIFY_INPUT:
# NOTE(review): Python-2 style — under Python 3 `print (x).encode('utf8')`
# calls .encode on print's return value (None) and raises.
print (u"\t".join([ segmented, vocalized])).encode('utf8')
tags = self.get_tags(noun_tuple, affix_tags)
print (u"\t".join([ araby.strip_tashkeel(vocalized), noun_tuple['unvocalized'], tags])).encode('utf8')
print ("*" + u"\t".join([ araby.strip_tashkeel(vocalized), noun_tuple['unvocalized'], u','.join(affix_tags)])).encode('utf8')
nb += 1
# segmented is expected to carry 4 '-'-separated fields (see len check).
listfields = segmented.split('-')
if len(listfields) == 4:
pref = listfields[0]
stem = listfields[1]
def hellper_get_sequance_positions(verse,sequance):
    """Find where the word sequence *sequance* occurs inside *verse*.

    Both arguments are diacritics-stripped and whitespace-tokenized before
    matching.  For every full consecutive match, the index of the LAST
    matched word in the verse is recorded — this mirrors the original code,
    whose running counter had already advanced to the end of the match when
    it appended.

    @param verse: text to search in
    @param sequance: word sequence to look for
    @return: list of 0-based word indices (last word of each match)
    """
    # NOTE(review): the original called a bare strip_tashkeel(), which is not
    # defined in this module; the `araby` import at the top of the file is
    # assumed to be the intended source — confirm against the original file.
    verse_words = araby.strip_tashkeel(verse).split()
    seq_words = araby.strip_tashkeel(sequance).split()
    if not seq_words:
        # the original loop also produced no positions for an empty sequence
        return []
    positions = []
    span = len(seq_words)
    # Slice comparison avoids the IndexError the original raised when a
    # partial match ran past the end of the verse.
    for start in range(len(verse_words) - span + 1):
        if verse_words[start:start + span] == seq_words:
            positions.append(start + span - 1)
    return positions
"""
UNVOCALIZED TEXT NOT NULL,
PROCLETIC TEXT,
TAGS TEXT,
VOCALIZED TEXT,
STEM TEXT,
TYPE TEXT,
ORIGINAL TEXT,
ENCLETIC TEXT
"""
#~ print(tuple_table[conj])
stemmed, tags = conj
result_fields['stemmed'] = stemmed
result_fields['vocalized'] = ar_stopwords.standardize_form(result_fields['stemmed']);
result_fields['word'] = ar_stopwords.standardize_form(result_fields['stemmed']);
result_fields['standard'] = araby.strip_tashkeel(result_fields['vocalized']);
parts = stemmed.split(';')
if len(parts)>=3:
result_fields['procletic'] = parts[0]
result_fields['stem'] = parts[1]
result_fields['encletic'] = parts[2]
result_fields['tags'] = tags #fields.get("tags", 'tags')
result_fields['unvocalized'] = result_fields['standard']
fields_table.append(result_fields)
return fields_table
# ---------------------------------------------------------------------------
# NOTE(review): orphaned, truncated fragment — the tail of a docstring plus
# the interior of what appears to be detectNamedPosition(); the function
# header and the `text`/`wordlist` setup are outside the visible file, the
# fragment is cut off mid-loop at the bottom, and indentation is lost.
# Two lines below look garbled and need repair against the original source:
#   * `wordlist#=text.split(u' ');` — a '#' fused into the assignment leaves
#     `wordlist` as a bare expression (NameError at runtime); presumably it
#     was `wordlist=text.split(u' ');`.
#   * `if i+1=0:` — a syntax error; the next line reads wordlist[i-1], so
#     the guard was presumably `if i-1>=0:`.
# Left byte-identical pending recovery of the original source.
# ---------------------------------------------------------------------------
>>> detectNamedPosition(u"قال خالد بن رافع حدثني أحمد بن عنبر عن خاله");
((1,3), (6,8))
"""
wordlist#=text.split(u' ');
#print words;
positions = [];
startNamed =-1;
endNamed =False;
# print u":".join(wordlist).encode('utf8');
for i in range(len(wordlist)):
word=wordlist[i];
if i+1=0:
previous=araby.stripTashkeel(wordlist[i-1]);
# drop a leading conjunction/preposition letter from the previous word
if previous and startNamed<0 and previous[0] in (u'و', u'ف', u'ل', u'ب', u'ك'):
previous=previous[1:];
else: previous = u''
#save the original word with possible harakat if exist
word_nm=araby.stripTashkeel(word);
key=word_nm;
# the first word can have prefixes
if word_nm and startNamed<0 and word_nm[0] in (u'و', u'ف', u'ل', u'ب', u'ك'):
key=word_nm[1:];
# name-chain markers: start a span on 'ابن', extend on the longer list
if startNamed<0 and key in (u'ابن', ):
startNamed=i;
endNamed=i
elif key in (u'ابن', u'بن',u'أبو',u'أبا', u'أبي', u'عبد' , u'عبيد' , u'بنو', u'بني', u'بنت'):
if startNamed<0:
startNamed=i;