Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
#
import re
import time
import pyarabic.araby as araby
# Pre-compiled character class matching any single one of the letters and
# marks listed below.
STAMP_PAT = re.compile(
    u"[" + u"".join([
        araby.ALEF,
        araby.YEH,
        araby.HAMZA,
        araby.ALEF_HAMZA_ABOVE,
        araby.WAW_HAMZA,
        araby.YEH_HAMZA,
        araby.WAW,
        araby.ALEF_MAKSURA,
        araby.ALEF_MADDA,
        araby.SHADDA,
    ]) + u"]",
    re.UNICODE,
)
def decode_tenses(field):
"""
Decode tenses field
"""
all=False;
past=False;
future=False;
passive=False;
imperative=False;
future_moode=False;
confirmed=False;
if field==u"يعملان":
all=True;
else:
# Conversion table used when two quiescent (sukun-bearing) letters meet:
# each key mark is replaced by the mark it maps to.
CONVERSION_TABLE = {
    ALEF_YEH_HARAKA: KASRA,
    ALEF_WAW_HARAKA: DAMMA,
    WAW_HARAKA: DAMMA,
    YEH_HARAKA: KASRA,
    ALTERNATIVE_YEH_HARAKA: DAMMA,
}

# The four basic marks, as a tuple.
HARAKAT = (SUKUN, FATHA, DAMMA, KASRA)
# The three long marks followed by the four basic marks, as one string.
HARAKAT2 = (ALEF_HARAKA + WAW_HARAKA + YEH_HARAKA
            + SUKUN + FATHA + DAMMA + KASRA)

# The five hamza forms, as a tuple and as a one-character regex class.
HAMZAT = (ALEF_HAMZA_ABOVE, WAW_HAMZA, YEH_HAMZA, HAMZA, ALEF_HAMZA_BELOW)
HAMZAT_PATTERN = re.compile(u"[" + u"".join(HAMZAT) + u"]", re.UNICODE)

# Matches any of the code points U+FEF7, U+FEF9 and U+FEF5.
LAM_ALEF_PAT = re.compile(u'[\ufef7\ufef9\ufef5]', re.UNICODE)

# Fixed mark sequences; the suffix presumably refers to the word length
# they apply to -- confirm against the code that consumes them.
UNIFORMATE_MARKS_4 = u"".join((FATHA, SUKUN, FATHA, FATHA))
UNIFORMATE_MARKS_5TEH = u"".join((FATHA, FATHA, SUKUN, FATHA, FATHA))
UNIFORMATE_MARKS_5 = u"".join((KASRA, SUKUN, FATHA, FATHA, FATHA))
UNIFORMATE_MARKS_6 = u"".join((KASRA, SUKUN, FATHA, SUKUN, FATHA, FATHA))

# Single-character markers for the start and the end of a word.
BEGIN_WORD = u"^"
END_WORD = u"$"
LONG_HARAKAT = (ALEF_HARAKA, YEH_HARAKA, WAW_HARAKA, ALEF_YEH_HARAKA,
# Conversion table: each key mark is replaced by the mark it maps to.
CONVERSION_TABLE = {
    ALEF_YEH_HARAKA: KASRA,
    ALEF_WAW_HARAKA: DAMMA,
    WAW_HARAKA: DAMMA,
    YEH_HARAKA: KASRA,
    ALTERNATIVE_YEH_HARAKA: DAMMA,
}

# The four basic marks, as a tuple.
HARAKAT = (SUKUN, FATHA, DAMMA, KASRA)
# The three long marks followed by the four basic marks, as one string.
HARAKAT2 = (ALEF_HARAKA + WAW_HARAKA + YEH_HARAKA
            + SUKUN + FATHA + DAMMA + KASRA)

# The five hamza forms, as a tuple and as a one-character regex class.
HAMZAT = (ALEF_HAMZA_ABOVE, WAW_HAMZA, YEH_HAMZA, HAMZA, ALEF_HAMZA_BELOW)
HAMZAT_PATTERN = re.compile(u"[" + u"".join(HAMZAT) + u"]", re.UNICODE)

# Matches any of the code points U+FEF7, U+FEF9 and U+FEF5.
LAM_ALEF_PAT = re.compile(u'[\ufef7\ufef9\ufef5]', re.UNICODE)

# Fixed mark sequences; the suffix presumably refers to the word length
# they apply to -- confirm against the code that consumes them.
UNIFORMATE_MARKS_4 = u"".join((FATHA, SUKUN, FATHA, FATHA))
UNIFORMATE_MARKS_5TEH = u"".join((FATHA, FATHA, SUKUN, FATHA, FATHA))
UNIFORMATE_MARKS_5 = u"".join((KASRA, SUKUN, FATHA, FATHA, FATHA))
UNIFORMATE_MARKS_6 = u"".join((KASRA, SUKUN, FATHA, SUKUN, FATHA, FATHA))

# Single-character markers for the start and the end of a word.
BEGIN_WORD = u"^"
END_WORD = u"$"

# The five long marks, as a tuple.
LONG_HARAKAT = (
    ALEF_HARAKA,
    YEH_HARAKA,
    WAW_HARAKA,
    ALEF_YEH_HARAKA,
    ALEF_WAW_HARAKA,
)

# Short alias for FATHA.
_F = FATHA
def highlight_diacritics_html(self, text):
"""
Highlight diacritics in the HTML text.

Scans text one character at a time. A FATHA, DAMMA, KASRA or SUKUN is
emitted wrapped between lefttag and righttag; any other character is
copied to the output unchanged.

NOTE(review): this excerpt is damaged (indentation lost, two lines
garbled), so part of the branching logic is not visible here.

@param text: the given text
@type text: unicode.
@return: the result as HTML.
@rtype: unicode.
"""
hight_text = u""
# NOTE(review): the inner double quotes are not escaped, so this line does
# not parse as a single string literal; presumably the attribute value was
# single-quoted or escaped in the original file -- confirm and restore.
lefttag = u"<span class="tashkeel">"
righttag = u"</span>"
for i in range(len(text)):
# Only the four short marks are highlighted.
if text[i] in (araby.FATHA, araby.DAMMA, araby.KASRA, araby.SUKUN):
# The mark must not be the first character, and the character before it
# must not be one of the listed letters (presumably letters that do not
# join to what follows -- confirm).
# NOTE(review): the last line of this condition is truncated; the rest of
# the test and the statement(s) of the branch it guards are missing and
# must be restored from the original file.
if (i>0 and text[i-1] not in (araby.ALEF,
araby.ALEF_HAMZA_ABOVE, araby.WAW_HAMZA, araby.ALEF_MADDA,
araby.DAL, araby.THAL, araby.WAW, araby.REH, araby.ZAIN,
araby.SHADDA)) and (i+1%s"%text[i]
# Emit the mark after a plain space, inside the highlighting span.
hight_text += u"".join([lefttag, " ", text[i], righttag])
else:
# Copy the character through untouched.
hight_text += text[i]
return hight_text
"""
vocalize a foreign names written in arabic
@param word: given word
@type word: unicode
@return: the vocalized word
@rtype: unicode
"""
marks =[]
previous = ""
for c in word:
if previous and not previous == araby.ALEF:
#--------- add Harakat before letter
if c in (araby.ALEF, araby.ALEF_MAKSURA, araby.TEH_MARBUTA,):
marks.pop()
marks.append(araby.FATHA)
elif c in (araby.WAW, araby.WAW_HAMZA):
marks.pop()
marks.append(araby.DAMMA)
elif c in( araby.YEH , araby.YEH_HAMZA ):
marks.pop()
marks.append(araby.KASRA)
#--------- add Harakat before letter
if c in (araby.ALEF_HAMZA_BELOW):
marks.append(araby.KASRA)
elif previous in (araby.ALEF_HAMZA_BELOW, araby.ALEF_HAMZA_ABOVE):
marks.append(araby.SUKUN)
elif previous in (araby.ALEF, araby.YEH, araby.WAW):
if c == araby.YEH_HAMZA :
marks.append(araby.KASRA)
else:
marks.append(araby.NOT_DEF_HARAKA)
previous = c
# $Author: Taha Zerrouki $
# $Revision: 0.7 $
# $Source: arabtechies.sourceforge.net
#
#***********************************************************************/
import sys
import re
import time
import pyarabic.araby as araby
# Used when treating the root to strip extra characters: a pre-compiled
# character class matching any single one of the letters and marks below.
stamp_pat = re.compile(
    u"[" + u"".join([
        araby.ALEF,
        araby.YEH,
        araby.HAMZA,
        araby.ALEF_HAMZA_ABOVE,
        araby.WAW_HAMZA,
        araby.YEH_HAMZA,
        araby.WAW,
        araby.ALEF_MAKSURA,
        araby.SHADDA,
    ]) + u"]",
    re.UNICODE,
)
def word_stamp(word):
"""
generate a stamp for a word,
remove all letters which can change form in the word :
- ALEF,
- HAMZA,
- YEH,
- WAW,
- ALEF_MAKSURA
- SHADDA
@return: stamped word
"""
# strip the last letter if is doubled
u'آهل':[u'أءهل'],
u'آوب':[u'ءاوب'],
u'آوى':[u'أءوى'],
u'آيد':[u'ءايد'],
u'آيس':[u'أءيس'],
}
STANDARD_REPLACEMENT=[
#-تحويل همزة القطع على الألف بعدها فتحة
#وهمزة القطع على الألف بعدها سكون إلى ألف ممدودة
( u"".join([ALEF_HAMZA_ABOVE, FATHA, ALEF]), ALEF_MADDA)
, ( u"".join([ALEF_MADDA, FATHA]), ALEF_MADDA)
, ( u"".join([ALEF_MADDA, ALEF]), ALEF_MADDA)
, ( u"".join([ALEF_HAMZA_ABOVE, FATHA, ALEF_HAMZA_ABOVE, SUKUN]), ALEF_MADDA)
, ( u"".join([ALEF_HAMZA_ABOVE, FATHA, ALEF_HAMZA_ABOVE, FATHA]), ALEF_MADDA)
, ( u"".join([ALEF_HAMZA_ABOVE, DAMMA, WAW_HAMZA, SUKUN]), ALEF_HAMZA_ABOVE+DAMMA+WAW)
, ( u"".join([YEH, SHADDA, FATHA, ALEF_MAKSURA]), YEH+SHADDA+FATHA+ALEF)
# إدغام النون الساكنة
, ( u"".join([NOON, SUKUN, NOON]), NOON+SHADDA)
# إذا كان الحرف الأول ساكنا وبعده شدة، ثم أضيفت إليه الألف
, ( u"".join([SUKUN, SHADDA]), SHADDA)
## معالجة ألف التفريق
, ( ALEF_WASLA, ALEF)
## معالجة ألف التفريق
, ( ALEF_MAMDUDA, ALEF)
# Short aliases for the long marks (two alias names per long mark).
_A = _AH = ALEF_HARAKA
_W = _WH = WAW_HARAKA
_Y = _YH = YEH_HARAKA
_AYH, _AWH = ALEF_YEH_HARAKA, ALEF_WAW_HARAKA
_YHALT = ALTERNATIVE_YEH_HARAKA

# Short aliases for the hamza forms.
_AHA, _AHB, _AM = ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW, ALEF_MADDA
_YHA, _WHA = YEH_HAMZA, WAW_HAMZA
_HZ = HAMZA

# Hamza form chosen for a word-initial hamza, keyed by a mark.
INITIAL_TAHMEEZ_TABLE = {
    _S: _HZ,
    _F: _AHA,
    _D: _AHA,
    _K: _AHB,
    _AH: _AM,
    _WH: _AHA,
    _YH: _AHB,
    _YHALT: _AHB,
}

# Hamza form chosen for a hamza in the middle of a word, keyed by two marks.
# NOTE(review): presumably indexed as [mark before][mark after] -- confirm
# against the code that performs the lookup.
MIDDLE_TAHMEEZ_TABLE = {
    _S: {
        _S: _HZ, _F: _AHA, _D: _WHA, _K: _YHA,
        _AH: _AHA, _WH: _WHA, _YH: _YHA,
    },
    _F: {
        _S: _AHA, _F: _AHA, _D: _WHA, _K: _YHA,
        _AH: _AHA, _WH: _WHA, _YH: _YHA,
    },
    _D: {
        _S: _WHA, _F: _WHA, _D: _WHA, _K: _YHA,
        _AH: _WHA, _WH: _WHA, _YH: _YHA,
    },
    # Every column of this row maps to the same form.
    _K: dict.fromkeys((_S, _F, _D, _K, _AH, _WH, _YH), _YHA),
    _AH: {
        _S: _HZ, _F: _HZ, _D: _WHA, _K: _YHA,
        _AH: _HZ, _WH: _WHA, _YH: _YHA,
    },
    _WH: {
        _S: _HZ, _F: _HZ, _D: _WHA, _K: _YHA,
        _AH: _HZ, _WH: _WHA, _YH: _YHA,
    },
    # Every column of this row maps to the same form.
    _YH: dict.fromkeys((_S, _F, _D, _K, _AH, _WH, _YH), _YHA),
}
u'آهل':[u'أءهل'],
u'آوب':[u'ءاوب'],
u'آوى':[u'أءوى'],
u'آيد':[u'ءايد'],
u'آيس':[u'أءيس'],
}
STANDARD_REPLACEMENT=[
#-تحويل همزة القطع على الألف بعدها فتحة
#وهمزة القطع على الألف بعدها سكون إلى ألف ممدودة
( u"".join([ALEF_HAMZA_ABOVE, FATHA, ALEF]), ALEF_MADDA)
, ( u"".join([ALEF_MADDA, FATHA]), ALEF_MADDA)
, ( u"".join([ALEF_MADDA, ALEF]), ALEF_MADDA)
, ( u"".join([ALEF_HAMZA_ABOVE, FATHA, ALEF_HAMZA_ABOVE, SUKUN]), ALEF_MADDA)
, ( u"".join([ALEF_HAMZA_ABOVE, FATHA, ALEF_HAMZA_ABOVE, FATHA]), ALEF_MADDA)
, ( u"".join([ALEF_HAMZA_ABOVE, DAMMA, WAW_HAMZA, SUKUN]), ALEF_HAMZA_ABOVE+DAMMA+WAW)
, ( u"".join([YEH, SHADDA, FATHA, ALEF_MAKSURA]), YEH+SHADDA+FATHA+ALEF)
# إدغام النون الساكنة
, ( u"".join([NOON, SUKUN, NOON]), NOON+SHADDA)
# إذا كان الحرف الأول ساكنا وبعده شدة، ثم أضيفت إليه الألف
, ( u"".join([SUKUN, SHADDA]), SHADDA)
## معالجة ألف التفريق
, ( ALEF_WASLA, ALEF)
## معالجة ألف التفريق
, ( ALEF_MAMDUDA, ALEF)
# Short aliases for the long marks (two alias names per long mark).
_A = _AH = ALEF_HARAKA
_W = _WH = WAW_HARAKA
_Y = _YH = YEH_HARAKA
_AYH, _AWH = ALEF_YEH_HARAKA, ALEF_WAW_HARAKA
_YHALT = ALTERNATIVE_YEH_HARAKA

# Short aliases for the hamza forms.
_AHA, _AHB, _AM = ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW, ALEF_MADDA
_YHA, _WHA = YEH_HAMZA, WAW_HAMZA
_HZ = HAMZA

# Hamza form chosen for a word-initial hamza, keyed by a mark.
INITIAL_TAHMEEZ_TABLE = {
    _S: _HZ,
    _F: _AHA,
    _D: _AHA,
    _K: _AHB,
    _AH: _AM,
    _WH: _AHA,
    _YH: _AHB,
    _YHALT: _AHB,
}

# Hamza form chosen for a hamza in the middle of a word, keyed by two marks.
# NOTE(review): presumably indexed as [mark before][mark after] -- confirm
# against the code that performs the lookup.
MIDDLE_TAHMEEZ_TABLE = {
    _S: {
        _S: _HZ, _F: _AHA, _D: _WHA, _K: _YHA,
        _AH: _AHA, _WH: _WHA, _YH: _YHA,
    },
    _F: {
        _S: _AHA, _F: _AHA, _D: _WHA, _K: _YHA,
        _AH: _AHA, _WH: _WHA, _YH: _YHA,
    },
    _D: {
        _S: _WHA, _F: _WHA, _D: _WHA, _K: _YHA,
        _AH: _WHA, _WH: _WHA, _YH: _YHA,
    },
    # Every column of this row maps to the same form.
    _K: dict.fromkeys((_S, _F, _D, _K, _AH, _WH, _YH), _YHA),
    _AH: {
        _S: _HZ, _F: _HZ, _D: _WHA, _K: _YHA,
        _AH: _HZ, _WH: _WHA, _YH: _YHA,
    },
    _WH: {
        _S: _HZ, _F: _HZ, _D: _WHA, _K: _YHA,
        _AH: _HZ, _WH: _WHA, _YH: _YHA,
    },
    # Every column of this row maps to the same form.
    _YH: dict.fromkeys((_S, _F, _D, _K, _AH, _WH, _YH), _YHA),
}