Spaces:
Sleeping
Sleeping
Add barchart showing shortest/longest median tokenized text
Browse files
app.py
CHANGED
|
@@ -9,6 +9,8 @@ import seaborn as sns
|
|
| 9 |
import numpy as np
|
| 10 |
import plotly.figure_factory as ff
|
| 11 |
import plotly.express as px
|
|
|
|
|
|
|
| 12 |
import random, glob
|
| 13 |
|
| 14 |
@st.cache_data
|
|
@@ -51,7 +53,9 @@ tokenizer_names_to_test = [
|
|
| 51 |
with st.sidebar:
|
| 52 |
|
| 53 |
st.header('All languages are NOT created (tokenized) equal!')
|
| 54 |
-
link="This project compares the tokenization length for different languages. For some tokenizers, tokenizing a message in one language may result in 10-20x more tokens than a comparable message in another language (e.g. try English vs. Burmese).
|
|
|
|
|
|
|
| 55 |
st.markdown(link)
|
| 56 |
|
| 57 |
st.header('Data Visualization')
|
|
@@ -130,7 +134,33 @@ with st.container():
|
|
| 130 |
)
|
| 131 |
st.plotly_chart(fig, use_container_width=True)
|
| 132 |
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
|
| 136 |
|
|
|
|
| 9 |
import numpy as np
|
| 10 |
import plotly.figure_factory as ff
|
| 11 |
import plotly.express as px
|
| 12 |
+
from plotly.subplots import make_subplots
|
| 13 |
+
import plotly.graph_objects as go
|
| 14 |
import random, glob
|
| 15 |
|
| 16 |
@st.cache_data
|
|
|
|
| 53 |
with st.sidebar:
|
| 54 |
|
| 55 |
st.header('All languages are NOT created (tokenized) equal!')
|
| 56 |
+
link="This project compares the tokenization length for different languages. For some tokenizers, tokenizing a message in one language may result in 10-20x more tokens than a comparable message in another language (e.g. try English vs. Burmese)."
|
| 57 |
+
st.markdown(link)
|
| 58 |
+
link="This is part of a larger project of measuring inequality in NLP. See the original article: [All languages are NOT created (tokenized) equal](https://www.artfish.ai/p/all-languages-are-not-created-tokenized) on [Art Fish Intelligence](https://www.artfish.ai/)."
|
| 59 |
st.markdown(link)
|
| 60 |
|
| 61 |
st.header('Data Visualization')
|
|
|
|
| 134 |
)
|
| 135 |
st.plotly_chart(fig, use_container_width=True)
|
| 136 |
|
| 137 |
+
|
| 138 |
+
# Bar chart: languages with the shortest vs. longest median tokenized length.
# Compute the per-language medians ONCE and slice both ends — the original ran
# the identical groupby/median/sort pipeline twice.
median_tokens = val_data.groupby('lang')[tokenizer_name].median().sort_values()
shortest = median_tokens.head(7).reset_index()
shortest["type"] = "shortest"
longest = median_tokens.tail(7).reset_index()
longest["type"] = "longest"
# NOTE(review): if fewer than 14 languages are present, head(7) and tail(7)
# overlap and a language appears twice — confirm the dataset always has >= 14.
combined = pd.concat([shortest, longest]).reset_index(drop=True).sort_values(by=tokenizer_name, ascending=False)
color_sequence = px.colors.qualitative.D3  # qualitative palette; any built-in sequence works
fig = px.bar(combined, x=tokenizer_name, y="lang", orientation='h', color='type', color_discrete_sequence=color_sequence)
fig.update_traces(hovertemplate='%{y}: %{x} tokens')
fig.update_layout(
    title=dict(text='Top Langs with Shortest and Longest Median Token Lengths',
               font=dict(size=25), automargin=True, yref='paper', pad=dict(b=20)),  # extra padding below the title
    xaxis=dict(
        title="Number of Tokens",
        showgrid=True,       # vertical gridlines for readability
        gridwidth=1,
        gridcolor='LightGrey'
    ),
    yaxis=dict(
        title="",            # language names label themselves; no axis title
    ),
    height=400,
    showlegend=False         # shortest/longest are obvious from bar ordering
)
st.plotly_chart(fig, use_container_width=True)
|
| 164 |
|
| 165 |
|
| 166 |
|