| import gradio as gr | |
| from datasets import load_dataset | |
| import re | |
| import numpy as np | |
# Fetch the Arabic word list from the Hugging Face Hub.
dataset = load_dataset("mohamedabdullah/Arabic-unique-words", data_files="ar_vocab.txt")

# Extract runs of 2-25 word characters, excluding ASCII letters and digits.
# Only the first row of the "train" split is scanned -- presumably the file
# keeps the whole vocabulary on a single line (TODO confirm).
# The pattern is a raw string so that ``\s`` and ``\W`` reach the regex engine
# without relying on Python's deprecated handling of unknown escapes.
word_l = re.findall(r'[^a-zA-Z0-9\s\W]{2,25}', dataset['train']['text'][0])

# A set gives constant-time membership tests for the spelling checks.
vocab = set(word_l)
def delete_letter(word):
    """Return every string obtained by removing one character from ``word``.

    The list is ordered by the index of the removed character and has
    ``len(word)`` entries; duplicates are kept (e.g. for doubled letters).
    """
    shortened = []
    for pos, _ in enumerate(word):
        shortened.append(word[:pos] + word[pos + 1:])
    return shortened
def switch_letter(word):
    """Return every string obtained by swapping two adjacent characters.

    Entry ``i`` of the result is ``word`` with the characters at positions
    ``i`` and ``i + 1`` exchanged, so the list has ``len(word) - 1`` entries
    (none for words shorter than two characters).

    The word is sliced directly rather than split with a ``\\w`` regex, so
    characters that are not "word" characters (diacritics, hyphens, ...) are
    kept in place instead of being silently dropped.
    """
    return [
        word[:i] + word[i + 1] + word[i] + word[i + 2:]
        for i in range(len(word) - 1)
    ]
def replace_letter(word):
    """Return all strings made by substituting one character of ``word``.

    Each position is replaced in turn by every letter of the Arabic alphabet
    string below. The unchanged word is excluded, duplicates are collapsed,
    and the result is returned as a sorted list.
    """
    alphabet = 'ابتةثجحخدذرزسشصضطظعغفقكلمنهويءآأؤإئ'
    variants = {
        word[:pos] + char + word[pos + 1:]
        for pos in range(len(word))
        for char in alphabet
    }
    # Substituting a letter by itself reproduces the input; drop it.
    variants.discard(word)
    return sorted(variants)
def insert_letter(word):
    """Return all strings made by inserting one Arabic letter into ``word``.

    Every gap is used, including before the first and after the last
    character. Results are ordered by insertion position, then by the letter's
    position in the alphabet string; duplicates are not removed.
    """
    alphabet = 'ابتةثجحخدذرزسشصضطظعغفقكلمنهويءآأؤإئ'
    inserted = []
    for pos in range(len(word) + 1):
        head, tail = word[:pos], word[pos:]
        inserted.extend(head + char + tail for char in alphabet)
    return inserted
def edit_one_letter(word, allow_switches=True):
    """Return the set of strings one edit away from ``word``.

    An edit is a single-character deletion, insertion or replacement and,
    when ``allow_switches`` is true, a swap of two adjacent characters.
    """
    candidates = set(delete_letter(word))
    candidates.update(insert_letter(word))
    candidates.update(replace_letter(word))
    if allow_switches:
        candidates.update(switch_letter(word))
    return candidates
def edit_two_letters(word, allow_switches=True):
    """Return the set of strings at most two edits away from ``word``.

    The result is the union of the one-edit strings and every string that is
    one further edit away from any of them.

    Args:
        word: The string to edit.
        allow_switches: Whether adjacent-character swaps count as an edit.
            The flag is forwarded to both rounds of ``edit_one_letter``.
    """
    first_pass = edit_one_letter(word, allow_switches)
    reachable = set(first_pass)
    for edit in first_pass:
        reachable |= edit_one_letter(edit, allow_switches)
    return reachable
def get_corrections(word, vocab):
    """Return candidate corrections for ``word`` drawn from ``vocab``.

    The first non-empty option wins:

    1. ``{word}`` if the word is already in the vocabulary;
    2. the vocabulary words one edit away;
    3. the vocabulary words at most two edits away;
    4. a one-element set holding an Arabic "no suitable suggestions" message.

    Each edit set is generated only when the cheaper options found nothing,
    so the much larger two-edit set is not built unnecessarily.

    Args:
        word: The word to correct.
        vocab: A container of known words supporting ``in``.

    Returns:
        A non-empty set of strings.
    """
    if word in vocab:
        return {word}

    for generate_edits in (edit_one_letter, edit_two_letters):
        matches = {
            candidate for candidate in generate_edits(word) if candidate in vocab
        }
        if matches:
            return matches

    return {'لم يتم العثور علي إقتراحات مناسبة لهذه الكلمة'}
def min_edit_distance(source, target, ins_cost=1, del_cost=1, rep_cost=2):
    """Return the minimum cost of editing ``source`` into ``target``.

    A ``(len(source) + 1) x (len(target) + 1)`` integer table is filled by
    dynamic programming, where cell ``[i, j]`` is the cheapest way to turn the
    first ``i`` characters of ``source`` into the first ``j`` characters of
    ``target``.

    Args:
        source: The starting string.
        target: The string to reach.
        ins_cost: Cost of inserting one character.
        del_cost: Cost of deleting one character.
        rep_cost: Cost of replacing one character with a different one.

    Returns:
        The bottom-right table entry (a NumPy integer).
    """
    rows, cols = len(source), len(target)
    table = np.zeros((rows + 1, cols + 1), dtype=int)

    # First column: delete every character of the source prefix.
    for i in range(1, rows + 1):
        table[i, 0] = table[i - 1, 0] + del_cost
    # First row: insert every character of the target prefix.
    for j in range(1, cols + 1):
        table[0, j] = table[0, j - 1] + ins_cost

    for i, src_char in enumerate(source, start=1):
        for j, tgt_char in enumerate(target, start=1):
            # Matching characters can be carried over for free.
            substitution = 0 if src_char == tgt_char else rep_cost
            table[i, j] = min(
                table[i - 1, j] + del_cost,
                table[i, j - 1] + ins_cost,
                table[i - 1, j - 1] + substitution,
            )

    return table[rows, cols]
def get_suggestions(corrections, word):
    """Return ``corrections`` as a list ordered by edit distance from ``word``.

    Distances come from ``min_edit_distance`` with its default costs, and the
    ordering is the one produced by ``np.argsort`` on those distances
    (closest first).
    """
    candidates = list(corrections)
    costs = [min_edit_distance(word, candidate) for candidate in candidates]
    return [candidates[rank] for rank in np.argsort(costs)]
def ar_spelling_checker(text):
    """Spell-check ``text`` and return the findings as an HTML fragment.

    Every run of three or more word characters is looked up in the module
    vocabulary. For each word that is missing, corrections are fetched with
    ``get_corrections`` and ordered with ``get_suggestions``.

    Args:
        text: The raw text to check.

    Returns:
        A string holding an inline ``<style>`` block followed by a
        right-to-left ``<div class="content">``. That div contains either a
        "no spelling mistakes" message or, per unknown word, the word, its
        suggestions and a separator.
    """
    result = {}
    # Raw string: ``\w`` must reach the regex engine as written.
    for word in re.findall(r'\w{3,}', text):
        # Known words need no work; repeated misspellings are handled once.
        if word in vocab or word in result:
            continue
        corrections = get_corrections(word, vocab)
        if not corrections:
            continue
        result[word] = get_suggestions(corrections, word)

    parts = ['''<style>
.content{
direction: rtl;
}
.word{
color: #842029;
background-color: #f8d7da;
border-color: #f5c2c7;
padding: 10px 20px;
display: inline-block;
direction: rtl;
font-size: 15px;
font-weight: 500;
margin-bottom: 15px;
box-sizing: border-box;
border: 1px solid transparent;
border-radius: 0.25rem;
}
.suggest{
color: #0f5132;
background-color: #d1e7dd;
border-color: #badbcc;
display: inline-block;
margin-right: 5px;
}
.separator{
height:3px;
background: #CCC;
margin-bottom: 15px;
}
.msg{
color: #0f5132;
background-color: #d1e7dd;
border-color: #badbcc;
border: 1px solid transparent;
border-radius: 0.25rem;
padding: 15px 20px;
direction: rtl;
font-size: 20px;
font-weight: 500;
text-align: center;
}
</style>''']
    parts.append('<div class="content">')

    if not result:
        parts.append('<div class="msg">لا توجد أخطاء إملائية 🤗</div>')

    # Extracted words consist of ``\w`` characters only, so they cannot carry
    # HTML metacharacters such as ``<`` or ``&``.
    for word, suggestions in result.items():
        parts.append(f'<div class="word">{word}</div><br />')
        parts.extend(
            f'<div class="word suggest">{suggest}</div>' for suggest in suggestions
        )
        parts.append('<div class="separator"></div>')

    parts.append('</div>')
    return ''.join(parts)
# Page-level CSS passed to Gradio: right-to-left input box, button colours,
# and the title/description typography.
_PAGE_CSS = """
#input{direction: rtl;}
#component-112{height: 30px;}
.gr-form{margin-top: 15px;}
.gr-text-input{font-size: 17px; height:50px; padding: 0.725rem;}
.text-gray-500{font-size: 16px; margin-bottom: 13px;}
.gr-button{color: #084298; background-color: #cfe2ff; border-color: #b6d4fe;
border: 1px solid transparent; border-radius: 0.25rem;
padding: 15px 20px; font-size: 20px; font-weight: 500; font-family: 'IBM Plex Mono';}
.output-html{min-height: 2rem;}
.title{text-align: center;font-size: 25px;margin-top: 13px;position: absolute;width:100%;
line-height: 1.5;font-family: 'IBM Plex Mono';}
.desc{text-align: center; font-size: 17px; font-family: 'IBM Plex Mono'; margin-top: 46px;}"""

# Build the page: heading, description, a text input, a button and an HTML
# output area.
with gr.Blocks(css=_PAGE_CSS) as demo:
    intro = gr.HTML('<h1 class="title">Arabic Spelling Checker 🤗</h1>')
    description = gr.HTML('<p class="desc">Web-based app to detect spelling mistakes in Arabic words using dynamic programming</p>')
    text = gr.Textbox(label="النص", elem_id="input")
    btn = gr.Button("Check Spelling ")
    output = gr.HTML()
    # Clicking the button runs the checker on the textbox contents and shows
    # the returned HTML in the output component.
    btn.click(ar_spelling_checker, [text], output)

demo.launch(inline=False)