Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -245,17 +245,22 @@ def get_results(tokenizer_name, base_lang, comp_lang, HF_token=""):
|
|
| 245 |
adverb = "more"
|
| 246 |
token_ratio = (token_ratio - 1.) * 100
|
| 247 |
|
| 248 |
-
output = f"You need {round(token_ratio, 3)}% {adverb} tokens to represent your text in {comp_lang} than in {base_lang}
|
| 249 |
return output
|
| 250 |
|
| 251 |
|
| 252 |
with gr.Blocks() as demo:
|
| 253 |
-
with gr.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
with gr.Column():
|
| 255 |
-
|
| 256 |
-
tokenizer = gr.Textbox(label="Tokenizer name", value="bert-base-cased")
|
| 257 |
-
with gr.Row():
|
| 258 |
-
HF_token = gr.Textbox(label="your HF Token")
|
| 259 |
|
| 260 |
with gr.Row():
|
| 261 |
with gr.Column():
|
|
|
|
| 245 |
adverb = "more"
|
| 246 |
token_ratio = (token_ratio - 1.) * 100
|
| 247 |
|
| 248 |
+
output = f"**You need {round(token_ratio, 3)}% {adverb} tokens to represent your text in {comp_lang} than in {base_lang}.**"
|
| 249 |
return output
|
| 250 |
|
| 251 |
|
| 252 |
with gr.Blocks() as demo:
|
| 253 |
+
with gr.Row():
|
| 254 |
+
gr.Markdown("""<h1>Language tokenization comparison</h1>
|
| 255 |
+
This tool helps you calculate how many more or fewer tokens you need to tokenize text in different languages.
|
| 256 |
+
To perform this comparison we use the [FLORES](https://github.com/facebookresearch/flores/tree/main) dataset, developed by Meta, which provides translations between English and low-resource languages.
|
| 257 |
+
We first tokenize around 1000 texts in the base language and in the language we want to compare. After that, we take the average of the input_ids lengths.""")
|
| 258 |
+
|
| 259 |
+
with gr.Row():
|
| 260 |
+
with gr.Column():
|
| 261 |
+
tokenizer = gr.Textbox(label="Tokenizer name", value="bert-base-cased")
|
| 262 |
with gr.Column():
|
| 263 |
+
HF_token = gr.Textbox(label="your HF Token")
|
|
|
|
|
|
|
|
|
|
| 264 |
|
| 265 |
with gr.Row():
|
| 266 |
with gr.Column():
|