Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def separate_token_with_dicrites(token):
    """Split a token into its letters, each grouped with its own tashkeel.

    Tatweel is stripped from the token first, then every base letter is
    emitted together with the run of diacritics that directly follows it.

    Args:
        token (str): string representing a word, aya or sura, possibly
            carrying tashkeel.

    Returns:
        list[str]: one string per base letter, made of the letter followed
            by its diacritics. Characters that are neither a base letter
            nor a diacritic directly attached to one are dropped.
    """
    # ``a or b or c`` evaluates to the first non-empty operand only, so a
    # membership test against it ignores every group but the first. Each
    # group is therefore checked individually.
    letter_groups = (alphabet, alefat, hamzat)
    mark_groups = (tashkeel, harakat, shortharakat, tanwin)

    def _in_any(char, groups):
        return any(char in group for group in groups)

    # Walk the tatweel-free text: a tatweel sitting between a letter and its
    # diacritics would otherwise end the run early and lose the diacritics.
    text = araby.strip_tatweel(token)
    length = len(text)

    hroof_with_tashkeel = []
    for start, char in enumerate(text):
        if not _in_any(char, letter_groups):
            continue
        # Extend the slice over every diacritic directly after the letter.
        end = start + 1
        while end < length and _in_any(text[end], mark_groups):
            end += 1
        hroof_with_tashkeel.append(text[start:end])
    return hroof_with_tashkeel
@rtype: Boolean
"""
if not word:
return True
if word.isdigit():
return True
for c in word:
if c in string.punctuation:
return True
# test if the word is previouslly spelled
# can get True or False
if word in self.worddict:
test = self.worddict.get(word, False)
else:
# if the word is not spelled
word = araby.strip_tatweel(word)
self.stemmer.segment(word)
# extract the affix
stem = self.stemmer.get_stem()
affix = u"-".join([self.stemmer.get_prefix(), self.stemmer.get_suffix()])
# lookup in the database
test = self.database.lookup(word, stem, affix)
self.worddict[word] = test
return test
def check_word(self, word, guessedtag=""):
"""
Analyze one word morphologically.

The word is stripped of tatweel, then looked up in the cache by its
unvocalised form; when no cached result is used, the analysis is built
from the punctuation check followed by the stop word check.

NOTE(review): the visible body ends without a return statement and
without storing the result, so this excerpt appears to be truncated.

@param word: the input word.
@type word: unicode.
@param guessedtag: not used in the visible part of the body.
@return: list of dictionaries of analyzed words with tags.
@rtype: list.
"""
# remove tatweel before any other processing
word = araby.strip_tatweel(word)
# tatweel-free form that still carries its tashkeel
word_vocalised = word
# form without tashkeel: used as the cache key and by the checks below
word_nm = araby.strip_tashkeel(word)
# get analysed details from cache if used
if self.allow_cache_use and self.cache.is_already_checked(word_nm):
#~ print (u"'%s'"%word).encode('utf8'), 'found'
resulted_data = self.cache.get_checked(word_nm)
else:
# no cached result is used: build the analysis from scratch
resulted_data = []
# if word is a punctuation
resulted_data += self.check_word_as_pounct(word_nm)
# Done: if the word is a stop word we have some problems,
# the stop word can also be another normal word (verb or noun),
# we must consider it in future works
# if word is stopword allow stop words analysis
resulted_data += self.check_word_as_stopword(word_nm)
def clean(self, strng):
    """
    Clean a string, or every string of a list, from tatweel and
    unnecessary whitespaces.

    @param strng: the text to clean: a string, a list of strings, or
        any other value.
    @return: the cleaned string, a new list of cleaned strings, or the
        input itself unchanged when it is neither a string nor a list.
    """
    def _clean_text(text):
        # Strip tatweel first, so that the whitespace left on both sides
        # of a removed tatweel is collapsed or trimmed as well.
        text = araby.strip_tatweel(text)
        # Raw string: '\s' in a plain literal is an invalid escape.
        return re.sub(r'\s+', ' ', text).strip()

    if isinstance(strng, str):
        return _clean_text(strng)
    if isinstance(strng, list):
        # Same steps, in the same order, as for a single string.
        return [_clean_text(item) for item in strng]
    return strng
"""Grouping each letter with its diacritics.
Args:
sentance: str
Returns:
[str]: a list of _x_, where _x_ is the letter accompanied with its
diacritics.
Example:
```python
q.grouping_letter_diacritics('إِنَّا أَعْطَيْنَكَ الْكَوْثَرَ')\n
>>> ['إِ', 'نَّ', 'ا', ' ', 'أَ', 'عْ', 'طَ', 'يْ', 'نَ', 'كَ', ' ', 'ا', 'لْ', 'كَ', 'وْ', 'ثَ', 'رَ']
```
"""
sentance_without_tatweel = strip_tatweel(sentance)
print(sentance_without_tatweel)
hroof_with_tashkeel = []
for index,i in enumerate(sentance):
if((sentance[index] in (alphabet or alefat or hamzat)or sentance[index] is ' ' )):
k = index
harf_with_taskeel =sentance[index]
while((k+1) != len(sentance) and (sentance[k+1] in (tashkeel or harakat or shortharakat or tanwin ))):
harf_with_taskeel =harf_with_taskeel+""+sentance[k+1]
k = k + 1
index = k
hroof_with_tashkeel.append(harf_with_taskeel)
return hroof_with_tashkeel
def clean(self, strng):
    """
    Clean a string, or every string of a list, from tatweel and
    unnecessary whitespaces.

    @param strng: the text to clean: a string, a list of strings, or
        any other value.
    @return: the cleaned string, a new list of cleaned strings, or the
        input itself unchanged when it is neither a string nor a list.
    """
    def _clean_text(text):
        # Strip tatweel first, so that the whitespace left on both sides
        # of a removed tatweel is collapsed or trimmed as well.
        text = araby.strip_tatweel(text)
        # Raw string: '\s' in a plain literal is an invalid escape.
        return re.sub(r'\s+', ' ', text).strip()

    if isinstance(strng, str):
        return _clean_text(strng)
    if isinstance(strng, list):
        # Same steps, in the same order, as for a single string.
        return [_clean_text(item) for item in strng]
    return strng
Returns:
str : zero and ones for each token
'''
marksDictionary = {'ْ': 0, '': 0, 'ُ': 1, 'َ': 1, 'ِ': 1, 'ّ': 1, 'ٌ': 1, 'ً': 1, 'ٍ': 1}
charWithOutTashkeelOrSukun = ''
tashkeelPatternList = [] # list of zeros and ones
marksList = []
# convert the List o to string without spaces
ayahModified = ''.join(ayah.strip())
tashkeelPatternStringWithSpace = ''
# check is there a tatweel in ayah or not
if(tatweel in ayahModified):
ayahModified = strip_tatweel(ayahModified)
# check whether exist alef_mad in ayah if exist unpack the alef mad
if (alef_mad in ayahModified):
ayahModified = unpack_alef_mad(ayahModified)
# separate tashkeel from the ayah
ayahOrAyatWithoutTashkeel, marks = separate(ayahModified)
for mark in marks:
#the pyarabic returns the char of marks without tashkeel with 'ـ' so if check about this mark if not exist
#append in list harakat and zero or ones in tashkeel pattern list if yes append the marks and patterns
if (mark != 'ـ'):
marksList.append(mark)
tashkeelPatternList.append(marksDictionary[mark])
else: