# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
if not noun_tuple or not noun_tuple.get('vocalized',''):
return ""
nb = 0
prefix_table =[]
suffix_table =[]
stem_table = []
flags_table ={}
for procletic, encletic, suffix in self.affixes_list:
affix_tags = snconst.COMP_PREFIX_LIST_TAGS[procletic]['tags'] \
+snconst.COMP_SUFFIX_LIST_TAGS[encletic]['tags'] \
+snconst.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']
#test if the given word from dictionary accept those
# tags given by affixes
# دراسة توافق الزوائد مع خصائص الاسم،
# مثلا هل يقبل الاسم التأنيث.
suffix_nm = araby.strip_tashkeel(suffix)
encletic_nm = araby.strip_tashkeel(encletic)
if nspell.validate_tags(noun_tuple, affix_tags, procletic, encletic_nm, suffix_nm):
if nspell.is_compatible_proaffix_affix(noun_tuple, procletic, encletic, suffix):
vocalized, semi_vocalized, segmented = nspell.vocalize(noun_tuple['vocalized'], procletic, suffix, encletic)
if VERIFY_INPUT:
print (u"\t".join([ segmented, vocalized])).encode('utf8')
tags = self.get_tags(noun_tuple, affix_tags)
print (u"\t".join([ araby.strip_tashkeel(vocalized), noun_tuple['unvocalized'], tags])).encode('utf8')
print ("*" + u"\t".join([ araby.strip_tashkeel(vocalized), noun_tuple['unvocalized'], u','.join(affix_tags)])).encode('utf8')
nb += 1
listfields = segmented.split('-')
if len(listfields) == 4:
pref = listfields[0]
stem = listfields[1]
def hellper_get_sequance_positions(verse,sequance):
    """
    Find where the word sequence *sequance* occurs inside *verse*.

    Both strings are stripped of tashkeel (diacritics) and split into
    words before comparison.  For every full occurrence of the sequence,
    the word index (0-based) of the LAST word of the match is recorded,
    which mirrors the original behaviour (the original advanced its
    counter while matching and appended it on the final word).

    Fixes an IndexError in the original code: a partial match reaching
    the end of the verse used to index past the last verse word.

    @param verse: the verse text to search in.
    @type verse: unicode.
    @param sequance: the word sequence to look for.
    @type sequance: unicode.
    @return: word positions of the last word of each full match.
    @rtype: list of int.
    """
    verse_words = strip_tashkeel(verse).split()
    seq_words = strip_tashkeel(sequance).split()
    positions = []
    seq_len = len(seq_words)
    if not seq_len:
        return positions
    # slide a window of seq_len words over the verse; slicing never
    # overruns the list, unlike the original index-based comparison
    for start in range(len(verse_words) - seq_len + 1):
        if verse_words[start:start + seq_len] == seq_words:
            # original semantics: record the index of the last matched word
            positions.append(start + seq_len - 1)
    return positions
"""
UNVOCALIZED TEXT NOT NULL,
PROCLETIC TEXT,
TAGS TEXT,
VOCALIZED TEXT,
STEM TEXT,
TYPE TEXT,
ORIGINAL TEXT,
ENCLETIC TEXT
"""
#~ print(tuple_table[conj])
stemmed, tags = conj
result_fields['stemmed'] = stemmed
result_fields['vocalized'] = ar_stopwords.standardize_form(result_fields['stemmed']);
result_fields['word'] = ar_stopwords.standardize_form(result_fields['stemmed']);
result_fields['standard'] = araby.strip_tashkeel(result_fields['vocalized']);
parts = stemmed.split(';')
if len(parts)>=3:
result_fields['procletic'] = parts[0]
result_fields['stem'] = parts[1]
result_fields['encletic'] = parts[2]
result_fields['tags'] = tags #fields.get("tags", 'tags')
result_fields['unvocalized'] = result_fields['standard']
fields_table.append(result_fields)
return fields_table
def get_unvoriginal(self, ):
    """
    Get the unvocalized original form of the input word.

    The stripped form is derived lazily from ``self.original`` and
    cached on ``self.unvoriginal`` for subsequent calls.
    @return: the given unvocalized original, or u"" when no original is set.
    @rtype: unicode string
    """
    # fast path: already computed and cached
    if self.unvoriginal:
        return self.unvoriginal
    # nothing to derive the unvocalized form from
    if not self.original:
        return u""
    self.unvoriginal = araby.strip_tashkeel(self.original)
    return self.unvoriginal
def get_vocalized_affixes_dict(self, forms = None):
    """
    Display vocalized affixes in a dict.

    Group the given vocalized affix forms by their unvocalized
    (tashkeel-stripped) skeleton.

    Fixes two defects of the original: a mutable default argument
    (``forms = []``), and a ``sort()`` performed *before* ``set()``
    deduplication, which discarded the ordering again — the result is
    now actually sorted and unique.

    @param forms: vocalized affix forms to group (default: none).
    @type forms: list of unicode.
    @return: mapping unvocalized form -> sorted unique vocalized forms.
    @rtype: dict.
    """
    forms_dict = {}
    if forms:
        for form in forms:
            unvoc = araby.strip_tashkeel(form)
            if unvoc in forms_dict:
                forms_dict[unvoc].append(form)
            else:
                forms_dict[unvoc] = [form, ]
    for key in forms_dict:
        if len(forms_dict[key]) >= 2:
            # deduplicate first, then sort, so the order survives
            forms_dict[key] = sorted(set(forms_dict[key]))
    return forms_dict
def add(self, word):
    """
    add a new vocalization given by user for unrecongnized word,
    and append the updated entry to the custom dictionary file.

    @param word: the vocalized word to record.
    @type word: unicode.
    @return: None
    @rtype: none
    """
    word_nm = araby.strip_tashkeel(word)
    # register the vocalization under its unvocalized key,
    # avoiding duplicate vocalizations for the same skeleton
    if word_nm not in self.dictio:
        self.dictio[word_nm] = [word, ]
    elif word not in self.dictio[word_nm]:
        self.dictio[word_nm].append(word)
    try:
        # persist the whole entry line: "<skeleton>\t<voc1>:<voc2>..."
        self.cdfile = open(self.filename, "a+")
        text = u"%s\t%s\n" % (word_nm, u':'.join(self.dictio[word_nm]))
        self.cdfile.write(text.encode('utf8'))
        self.cdfile.close()
    except (IOError, OSError):
        # narrowed from a bare except: only I/O failures are expected here;
        # the in-memory dictionary stays updated even if the write fails
        print("updating: can't update custom dictionary")
def __del__(self,):
the given word, and give ذئب.
@param word_vocalised: the input word.
@type word_vocalised: unicode.
@param resulted_data: the founded resulat from dictionary.
@type resulted_data: list of dict.
@return: list of dictionaries of analyzed words with tags.
@rtype: list.
"""
#print word_vocalised.encode('utf8')
filtred_data = []
inputword = araby.strip_tashkeel(word_vocalised)
for item in resulted_data:
if 'vocalized' in item.__dict__: #.has_key('vocalized') :
#~ if 'vocalized' in item :
#~ outputword = araby.strip_tashkeel(item['vocalized'])
outputword = araby.strip_tashkeel(item.__dict__['vocalized'])
#~ print u'\t'.join([inputword, outputword]).encode('utf8')
if inputword == outputword:
#item['tags'] += ':a'
filtred_data.append(item)
#~ filtred_data.append(item)
return filtred_data
flags += svconst.TabSuffixes[pronoun]['full'];
# add flag yeh for the الأفعال الخمسة
if tense == const.TenseFuture and pronoun in (const.PronounAnti, const.PronounAntuma, const.PronounAntuma_f,
const.PronounAntum, const.PronounHuma, const.PronounHuma_f, const.PronounHum ):
flags+=u"Ha";
# add double object suffixe, if the verb is double transitive, and the tense is indicative
if v['double_trans'] and tense in const.TableIndicativeTense:
# add flags for suffixes (double object)
flags += svconst.TabDisplayTagDouble[pronoun]['full'];
#add an entree to the table entrie
# this allows to reduce many cases into one entree
word_nm = araby.strip_tashkeel(conjugTable[tense][pronoun]);
if TableEntries.has_key(word_nm):
TableEntries[word_nm] += flags;
else:
TableEntries[word_nm] = flags;
#print (u'%s/%s\t%s%s'%(ar_strip_marks(conjugTable[tense][pronoun]), flags, word,verb_cat)).encode('utf8');
# print element from the TableEntries
for key in TableEntries.keys():
if key!="":
line +=u'%s/%s\n'%(key, vspell.unify_flags(TableEntries[key]))
return line
to treat some normalized cases,
the analyzer return the vocalized like words
ُIf the word is ذئب, the normalized form is ذءب,
which can give from dictionary ذئبـ ذؤب.
this function filter normalized resulted word according
the given word, and give ذئب.
@param word_vocalised: the input word.
@type word_vocalised: unicode.
@param resulted_data: the founded resulat from dictionary.
@type resulted_data: list of dict.
@return: list of dictionaries of analyzed words with tags.
@rtype: list.
"""
#print word_vocalised.encode('utf8')
filtred_data = []
inputword = araby.strip_tashkeel(word_vocalised)
for item in resulted_data:
if 'vocalized' in item.__dict__: #.has_key('vocalized') :
#~ if 'vocalized' in item :
#~ outputword = araby.strip_tashkeel(item['vocalized'])
outputword = araby.strip_tashkeel(item.__dict__['vocalized'])
#~ print u'\t'.join([inputword, outputword]).encode('utf8')
if inputword == outputword:
#item['tags'] += ':a'
filtred_data.append(item)
#~ filtred_data.append(item)
return filtred_data