Spaces:

jason9693
/

KoreanHateSpeechClassifier

Runtime error

App Files Files Community

kevin-yang committed on Oct 25, 2021

Commit

b1944b2

1 Parent(s): cf3d4e2

initial commit

Browse files

Files changed (9) hide show

NanumGothicCoding-Bold.ttf +0 -0
NanumGothicCoding.ttf +0 -0
__pycache__/bertviz.cpython-38.pyc +0 -0
__pycache__/util.cpython-36.pyc +0 -0
attention.py +97 -0
bvz.py +10 -0
requirements.txt +5 -0
test_demp.py +38 -0
util.py +384 -0

NanumGothicCoding-Bold.ttf ADDED Viewed

Binary file (1.8 MB). View file

NanumGothicCoding.ttf ADDED Viewed

Binary file (2.78 MB). View file

__pycache__/bertviz.cpython-38.pyc ADDED Viewed

Binary file (533 Bytes). View file

__pycache__/util.cpython-36.pyc ADDED Viewed

Binary file (6.01 kB). View file

attention.py ADDED Viewed

	@@ -0,0 +1,97 @@

+from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
+import gradio as gr
+from torch.nn import functional as F
+import seaborn
+import matplotlib
+import platform
+# On macOS, select matplotlib's non-interactive 'Agg' backend. This runs
+# before `matplotlib.pyplot` is imported further down.
+# NOTE(review): presumably needed because figures are only rendered to
+# images for the web UI and never shown in a window -- confirm.
+if platform.system() == "Darwin":
+    print("MacOS")
+    matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+import io
+from PIL import Image
+import matplotlib.font_manager as fm
+import util
+# Font used for the (Korean) token labels on the heatmaps.
+font_path = r'NanumGothicCoding.ttf'
+fontprop = fm.FontProperties(fname=font_path, size=18)
+try:
+    # Register the bundled TTF with matplotlib and select it by the family
+    # name stored in the file itself, so the labels render even when no
+    # Nanum font is installed system-wide.
+    fm.fontManager.addfont(font_path)
+    plt.rcParams["font.family"] = fontprop.get_name()
+except (OSError, RuntimeError):
+    # Font file missing or unreadable: keep the previous behaviour and rely
+    # on a system-installed font of this name.
+    plt.rcParams["font.family"] = 'NanumGothic'
+def visualize_attention(sent, attention_matrix, n_words=10):
+    """Draw six attention heatmaps in a 2x3 grid and return the figure.
+
+    Args:
+        sent: Token strings used as tick labels on the heatmaps.
+        attention_matrix: Indexable collection of 2-D weight maps. The entries
+            at indices 1, 3, 5, 7, 9 and 11 are plotted, so at least 12
+            entries are required. Colours use the fixed range [0.0, 1.0].
+        n_words: Currently unused; kept so existing callers keep working.
+
+    Returns:
+        The matplotlib figure holding the six subplots. It is closed in
+        pyplot before being returned so it does not stay registered there.
+    """
+    fig = plt.figure(figsize=(16, 8))
+    for position, layer in enumerate(range(1, 12, 2), start=1):
+        ax = fig.add_subplot(2, 3, position)
+        ax.set_title("Layer {}".format(layer))
+        # Token labels only where they are readable: x labels on the bottom
+        # row (7, 9, 11) and y labels on the left column (1, 7).
+        seaborn.heatmap(attention_matrix[layer],
+                        xticklabels=sent if layer > 6 else [],
+                        yticklabels=sent if layer in [1, 7] else [],
+                        square=True, vmin=0.0, vmax=1.0,
+                        cbar=False, ax=ax)
+    fig.tight_layout()
+    # Close this specific figure rather than pyplot's "current" one, which is
+    # global state and may belong to another request.
+    plt.close(fig)
+    return fig
+# Loaded (tokenizer, model, config) triples keyed by model name, so each
+# checkpoint is read only once per process instead of once per request.
+_MODEL_CACHE = {}
+def _load_model(model_name):
+    """Return the cached (tokenizer, model, config) triple for `model_name`."""
+    if model_name not in _MODEL_CACHE:
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        model = AutoModelForSequenceClassification.from_pretrained(model_name)
+        config = AutoConfig.from_pretrained(model_name)
+        model.eval()
+        _MODEL_CACHE[model_name] = (tokenizer, model, config)
+    return _MODEL_CACHE[model_name]
+def predict(model_name, text):
+    """Classify `text` with `model_name` and plot its attention maps.
+
+    Args:
+        model_name: Checkpoint identifier passed to `from_pretrained`.
+        text: The sentence to classify.
+
+    Returns:
+        A `(result, fig)` tuple: `result` maps each label name from
+        `config.id2label` to its softmax probability (a float), and `fig` is
+        the figure produced by `visualize_attention`.
+    """
+    # Local import: this file only imports `torch.nn.functional`.
+    import torch
+    tokenizer, model, config = _load_model(model_name)
+    print(config.id2label)
+    tokenized_text = tokenizer([text], return_tensors='pt')
+    input_tokens = tokenizer.convert_ids_to_tokens(tokenized_text.input_ids[0])
+    print(input_tokens)
+    # Byte-level BPE tokens are converted back to readable text for the labels.
+    if config.model_type in ['roberta', 'gpt', 'gpt2']:
+        input_tokens = util.bytetokens_to_unicdode(input_tokens)
+    # Inference only: skip building the autograd graph.
+    with torch.no_grad():
+        output, attention = model(**tokenized_text, output_attentions=True, return_dict=False)
+    probabilities = F.softmax(output, dim=-1)[0].numpy()
+    result = {config.id2label[idx]: float(prob) for idx, prob in enumerate(probabilities)}
+    # NOTE(review): `attention[0][0]` takes the first element of the attention
+    # output and then its first entry -- presumably layer 0 of batch item 0,
+    # i.e. per-head maps, in which case the "Layer k" subplot titles actually
+    # index heads. Confirm the intended slice.
+    fig = visualize_attention(input_tokens, attention[0][0].numpy())
+    return result, fig
+if __name__ == '__main__':
+    # Checkpoint and sentence offered as the clickable example in the UI.
+    model_name = 'jason9693/SoongsilBERT-beep-base'
+    text = '읿딴걸 홍볿글 읿랉곭 쌑젩낄고 앉앟있냩'
+    # Checkpoints selectable from the dropdown.
+    model_name_list = ['jason9693/SoongsilBERT-beep-base']
+    model_dropdown = gr.inputs.Dropdown(model_name_list, label="Model Name")
+    # Gradio app whose submit button calls predict(model_name, text).
+    interface_options = dict(
+        fn=predict,
+        server_port=26899,
+        server_name='0.0.0.0',
+        inputs=[model_dropdown, 'text'],
+        outputs=['label', 'plot'],
+        examples=[[model_name, text]],
+        title="한국어 혐오성 발화 분류기 (Korean Hate Speech Classifier)",
+        description="Korean Hate Speech Classifier with Several Pretrained LM\nCurrent Supported Model:\n1. SoongsilBERT",
+    )
+    app = gr.Interface(**interface_options)
+    app.launch(inline=False)

bvz.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from transformers import AutoTokenizer, AutoModel
+from bertviz import model_view
+# Minimal bertviz example: run one sentence through a pretrained model and
+# hand its attention weights to `model_view`.
+MODEL_NAME = "distilbert-base-uncased"
+SENTENCE = "The cat sat on the mat"
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+model = AutoModel.from_pretrained(MODEL_NAME, output_attentions=True)
+inputs = tokenizer.encode(SENTENCE, return_tensors='pt')
+outputs = model(inputs)
+# With output_attentions=True the attention weights are the last output element.
+attention = outputs[-1]
+tokens = tokenizer.convert_ids_to_tokens(inputs[0])
+model_view(attention, tokens)

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+transformers==4.3.0
+torch==1.6.0
+matplotlib
+seaborn
+numpy

test_demp.py ADDED Viewed

	@@ -0,0 +1,38 @@

+import gradio as gr
+import matplotlib.pyplot as plt
+import numpy as np
+def stock_forecast(final_year, companies, noise, show_legend, point_style):
+    """Plot one synthetic quadratic curve per company and return the figure.
+
+    Args:
+        final_year: Last year on the x axis; the axis starts at 2020.
+        companies: Company names; one curve is drawn per entry and the names
+            are used as legend labels.
+        noise: Scale of the uniform random noise added to every point.
+        show_legend: Whether to add a legend to the plot.
+        point_style: One of "cross", "line" or "circle".
+
+    Returns:
+        The matplotlib figure, already closed in pyplot.
+
+    Raises:
+        KeyError: If `point_style` is not one of the supported names.
+    """
+    start_year = 2020
+    x = np.arange(start_year, final_year + 1)
+    year_count = x.shape[0]
+    plt_format = {"cross": "X", "line": "-", "circle": "o--"}[point_style]
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+    # The quadratic base curve is the same for every company; only its scale
+    # and the random noise differ.
+    base = np.arange(0, year_count, dtype=float) ** 2
+    for i, _company in enumerate(companies):
+        series = base * (i + 1) + np.random.rand(year_count) * noise
+        ax.plot(x, series, plt_format)
+    if show_legend:
+        # Use this figure's own axes rather than pyplot's global "current" axes.
+        ax.legend(companies)
+    # Close exactly this figure; pyplot's current figure is shared global state.
+    plt.close(fig)
+    return fig
+# Gradio interface around stock_forecast. The five input components are
+# passed positionally, in order: final_year, companies, noise, show_legend,
+# point_style. The returned figure is shown as an image labelled "forecast".
+iface = gr.Interface(
+    stock_forecast,
+    [
+        gr.inputs.Radio([2025, 2030, 2035, 2040], label="Project to:"),
+        gr.inputs.CheckboxGroup(["Google", "Microsoft", "Gradio"]),
+        gr.inputs.Slider(1, 100),
+        "checkbox",
+        gr.inputs.Dropdown(["cross", "line", "circle"], label="Style")],
+    gr.outputs.Image(plot=True, label="forecast"))
+# NOTE(review): this call sits outside the __main__ guard, so it also runs
+# whenever the module is imported -- confirm that is intended.
+iface.test_launch()
+if __name__ == "__main__":
+    iface.launch(inline=False)

util.py ADDED Viewed

	@@ -0,0 +1,384 @@

+from functools import lru_cache
+@lru_cache()
+def bytes_to_unicode_dict():
+    """
+    Build the reverse byte-level BPE table: unicode character -> byte value (0-255).
+    Bytes in the ranges "!".."~", "¡".."¬" and "®".."ÿ" are keyed by their own character.
+    Every other byte (whitespace, control characters and the remaining gaps) is keyed by
+    chr(256), chr(257), ... assigned in increasing byte order, so the space byte is keyed by "Ġ".
+    The result is cached, so all callers share the same dict object.
+    """
+    printable = [
+        *range(ord("!"), ord("~") + 1),
+        *range(ord("¡"), ord("¬") + 1),
+        *range(ord("®"), ord("ÿ") + 1),
+    ]
+    table = {chr(byte): byte for byte in printable}
+    shifted = 0
+    for byte in range(2 ** 8):
+        if chr(byte) in table:
+            continue
+        # Not printable as-is: give it the next free code point above 255.
+        table[chr(2 ** 8 + shifted)] = byte
+        shifted += 1
+    return table
+# Module-level lookup table: byte-level BPE character -> original byte value.
+ORD_UNICODE_MAP = bytes_to_unicode_dict()
+@lru_cache()
+def byte_to_char(bytestr):
+    """
+    Decode one byte-level BPE token string back into readable text.
+    Each character is mapped to its byte through ORD_UNICODE_MAP and the bytes are decoded as
+    UTF-8; incomplete or invalid sequences become the replacement character instead of raising.
+    Raises KeyError if the token contains a character that is not in the table.
+    """
+    raw = bytes(ORD_UNICODE_MAP[symbol] for symbol in bytestr)
+    return raw.decode("utf-8", errors="replace")
+# Not cached itself; the per-token work is cached inside byte_to_char.
+def bytetokens_to_unicdode(byte_tokens: list):
+    """Decode every byte-level BPE token in `byte_tokens`; returns a new list in the same order."""
+    return list(map(byte_to_char, byte_tokens))
+if __name__ == '__main__':
+    tokens = ['<s>',
+        'ì¹´ì¹´ìĺ¤',
+        'ìĹĶ',
+        'íĦ°',
+        'íĶĦëĿ¼ìĿ´',
+        'ì¦Ī',
+        '(',
+        'ëĮĢíĳľ',
+        'Ġë°±',
+        'ìĥģ',
+        'ìĹ½',
+        ')',
+        'ê°Ģ',
+        'Ġìĺ¬íķ´',
+        'Ġ8',
+        'ìĽĶ',
+        'Ġê¸°ì¤Ģ',
+        'Ġëĭ¤ìĪĺ',
+        'Ġê¶Į',
+        'ìľĦ',
+        'ĠìŀĪëĬĶ',
+        'Ġê¸Ģë¡ľë²Į',
+        'ĠíķĻ',
+        'íļĮìĹĲìĦľ',
+        'Ġì´Ŀ',
+        'Ġ16',
+        'ê±´',
+        'ìĿĺ',
+        'ĠìĿ¸ê³µ',
+        'ì§Ģ',
+        'ëĬ¥',
+        '(',
+        'A',
+        'I',
+        ')',
+        'Ġëħ¼ë¬¸',
+        'ìĿĦ',
+        'Ġëĵ±',
+        'ìŀ¬',
+        'íĸĪëĭ¤ê³ł',
+        'Ġ9',
+        'ìĿ¼',
+        'Ġë°ĿíĺĶ',
+        'ëĭ¤',
+        '.',
+        'Ġì§ĢëĤľíķ´',
+        'Ġëĵ±',
+        'ìŀ¬',
+        'íķľ',
+        'Ġ13',
+        'ê±´ë',
+        '³´ëĭ¤',
+        'Ġ3',
+        'ê±´',
+        'Ġë§İìĿĢ',
+        'Ġëħ¼ë¬¸',
+        'ìĿ´',
+        'Ġë°ĺ',
+        'ëħĦ',
+        'ìĹ¬',
+        'Ġë§ĮìĹĲ',
+        'Ġì±Ħ',
+        'íĥĿ',
+        'ëĲĲëĭ¤',
+        '.',
+        'Ġì¹´ì¹´ìĺ¤',
+        'ìĹĶ',
+        'íĦ°',
+        'íĶĦëĿ¼ìĿ´',
+        'ì¦Ī',
+        '(',
+        'ìĿ´',
+        'íķĺ',
+        'Ġì¹´ì¹´ìĺ¤',
+        'ìĹĶ',
+        'íĦ°',
+        ')',
+        'ëĬĶ',
+        'ĠA',
+        'I',
+        'ĠìĹ°êµ¬',
+        'ĠìĦ±',
+        'ê³¼ë¥¼',
+        'ĠìĿ´',
+        'ìĸ´ê°Ģ',
+        'ê¸°',
+        'ĠìľĦíķ´',
+        'ĠìĿ¸ìŀ¬',
+        'ĠíĻķë³´',
+        'ìĹĲ',
+        'ĠìĨį',
+        'ëıĦë¥¼',
+        'ĠëĨĴìĿ´',
+        'ê²łëĭ¤ëĬĶ',
+        'Ġë°©',
+        'ì¹¨',
+        'ìĿ´ëĭ¤',
+        '.',
+        'Ċ',
+        'Ċ',
+        'ì¹´ì¹´ìĺ¤',
+        'ìĹĶ',
+        'íĦ°',
+        'ëĬĶ',
+        'Ġ8',
+        'ìĽĶ',
+        'ĠìŀĲìĹ°',
+        'ìĸ´',
+        'ì²ĺë¦¬',
+        'Ġë¶Ħìķ¼',
+        'ìĿĺ',
+        'Ġê¸Ģë¡ľë²Į',
+        'Ġíĥĳ',
+        'ĠíķĻ',
+        'íļĮ',
+        'ìĿ¸',
+        "Ġ'",
+        'A',
+        'C',
+        'L',
+        '-',
+        'I',
+        'J',
+        'C',
+        'N',
+        'L',
+        'P',
+        "'",
+        'ìĹĲ',
+        'Ġëħ¼ë¬¸',
+        'ìĿĦ',
+        'Ġë°ľíĳľ',
+        'íķľ',
+        'ĠìĤ¬ë¡Ģ',
+        'ê¹Įì§Ģ',
+        'Ġíķ©',
+        'íķ´',
+        'Ġìĺ¬íķ´',
+        'Ġì´Ŀ',
+        'Ġ16',
+        'ê±´',
+        'ìĿĺ',
+        'ĠA',
+        'I',
+        'Ġëħ¼ë¬¸',
+        'ìĿĦ',
+        'Ġëĵ±',
+        'ìŀ¬',
+        'íĸĪëĭ¤ê³ł',
+        'Ġë°ĿíĺĶ',
+        'ëĭ¤',
+        '.',
+        'ĠìĿ´',
+        'Ġëħ¼ë¬¸',
+        'ìĿĢ',
+        'ĠìĿ¸ëıĦ',
+        'ë©Ķ',
+        'ìĿ¸',
+        '(',
+        'in',
+        '-',
+        'd',
+        'om',
+        'a',
+        'in',
+        ')',
+        'Ġìĥĺ',
+        'íĶĮ',
+        'ìĿĦ',
+        'ĠìĤ¬ìļ©',
+        'íķ´',
+        'ĠìŀĲìĹ°',
+        'ìĸ´',
+        'Ġê³µê²©',
+        'Ġë°©ìĭĿìľ¼ë¡ľ',
+        'ĠìķĦìĽĥ',
+        'ìĺ¤',
+        'ë¸Į',
+        'ëıĦ',
+        'ë©Ķ',
+        'ìĿ¸',
+        '(',
+        'out',
+        '-',
+        'of',
+        '-',
+        'd',
+        'om',
+        'a',
+        'in',
+        ')',
+        'Ġìĥĺ',
+        'íĶĮ',
+        'ìĿĦ',
+        'ĠìŀĲëıĻ',
+        'ìľ¼ë¡ľ',
+        'ĠìĥĿ',
+        'ìĦ±',
+        ',',
+        'Ġë¶Ħ',
+        'ë¥ĺ',
+        'Ġëª¨ëį¸',
+        'ìĿĺ',
+        'Ġê°Ĳ',
+        'ì§Ģ',
+        'ĠëĬ¥ëł¥ìĿĦ',
+        'Ġíĸ¥',
+        'ìĥģ',
+        'ìĭľíĤ¤ëĬĶ',
+        'ĠëĤ´ìļ©',
+        'ìĿĺ',
+        'Ġëħ¼ë¬¸',
+        'ìĿ´ëĭ¤',
+        '.',
+        'Ċ',
+        'Ċ',
+        '7',
+        'ìĽĶ',
+        'ìĹĲëĬĶ',
+        'Ġë¨¸',
+        'ìĭł',
+        'ëŁ¬',
+        'ëĭĿ',
+        'ĠíķĻ',
+        'íļĮ',
+        "Ġ'",
+        'I',
+        'C',
+        'M',
+        'L',
+        "'",
+        'ìĹĲ',
+        'Ġíļ¨ìľ¨',
+        'ìłģìĿ¸',
+        'Ġê³ł',
+        'íĴĪ',
+        'ì§Ī',
+        'ĠìĿĮ',
+        'ìĦ±',
+        'íķ©',
+        'ìĦ±ìĿ´',
+        'Ġê°ĢëĬ¥íķľ',
+        "Ġ'",
+        'ìĹĶ',
+        'ëĵľ',
+        'ĠíĪ¬',
+        'ĠìĹĶ',
+        'ëĵľ',
+        '(',
+        'en',
+        'd',
+        '-',
+        't',
+        'o',
+        '-',
+        'en',
+        'd',
+        ')',
+        "'",
+        'Ġëª¨ëį¸',
+        'ìĿĦ',
+        'ĠìłľìķĪ',
+        'íķĺëĬĶ',
+        'Ġëħ¼ë¬¸',
+        'ìĿĦ',
+        'Ġë°ľíĳľ',
+        'íĸĪëĭ¤',
+        '.',
+        'Ġ6',
+        'ìĽĶ',
+        'ìĹĲëĬĶ',
+        'ĠìĿĮ',
+        'íĸ¥',
+        'Â·',
+        'ìĿĮ',
+        'ìĦ±',
+        'Ġìĭł',
+        'íĺ¸',
+        'ì²ĺë¦¬',
+        'Ġë¶Ħìķ¼',
+        'ĠíķĻ',
+        'ìĪł',
+        'ëĮĢíļĮ',
+        "Ġ'",
+        'I',
+        'C',
+        'A',
+        'S',
+        'S',
+        'P',
+        "'",
+        'ìĹĲ',
+        'ĠëĮĢ',
+        'ê·ľëª¨',
+        'Ġíħ',
+        'į',
+        'ìĬ¤íĬ¸',
+        'Ġì½Ķ',
+        'íį¼ìĬ¤',
+        '(',
+        'ìĸ¸',
+        'ìĸ´',
+        'ĠìĹ°',
+        'êµ¬ë¥¼',
+        'ĠìľĦíķ´',
+        'Ġíħ',
+        'į',
+        'ìĬ¤íĬ¸ë¥¼',
+        'Ġì»´íĵ¨íĦ°',
+        'ê°Ģ',
+        'ĠìĿ½ìĿĦ',
+        'ĠìĪĺ',
+        'ĠìŀĪëĬĶ',
+        'Ġíĺķíĥľë¡ľ',
+        'Ġëª¨ìķĦ',
+        'ĠëĨĵìĿĢ',
+        'Ġìĸ¸ìĸ´',
+        'ĠìŀĲë£Į',
+        ')',
+        'Ġìłķë³´',
+        'ĠíķĻìĬµ',
+        'ìĹĲ',
+        'ĠëĮĢíķľ',
+        'Ġëħ¼ë¬¸',
+        'Ġ1',
+        'ê±´ìĿĦ',
+        'Ġìĭ¤',
+        'ìĹĪëĭ¤',
+        '.',
+        'Ċ',
+        '</s>']
+    # Micro-benchmark: decode the sample tokens 1000 times and report the
+    # elapsed time. perf_counter() is monotonic and high-resolution, unlike
+    # the wall-clock time.time(). byte_to_char is lru_cache'd, so repeated
+    # passes largely measure cache lookups.
+    import time
+    start = time.perf_counter()
+    for _ in range(1000):
+        result = bytetokens_to_unicdode(tokens)
+    end = time.perf_counter()
+    print(result)
+    print(f'time: {end-start}')