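"""Tokenizer utilities for caption sequences.

TokenizerWrapper bundles two tokenization paths: a Keras word-level tokenizer
(init_tokenizer / tokenize_sentences) and the Hugging Face GPT-2 tokenizer
(GPT2_encode / GPT2_decode). Both produce fixed-length token sequences of
length max_caption_length.
"""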
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer # type: ignore
from tensorflow.keras.preprocessing.sequence import pad_sequences # type: ignore
from tensorflow.keras.preprocessing.text import text_to_word_sequence # type: ignore
import numpy as np
from transformers import GPT2Tokenizer


class TokenizerWrapper:
    def __init__(self, class_name, max_caption_length, tokenizer_num_words=None):
        # The Keras word-level tokenizer is fitted separately via init_tokenizer(),
        # e.g. from a dataset CSV:
        #   dataset_df = pd.read_csv(dataset_csv_file)
        #   self.init_tokenizer(dataset_df[class_name].tolist())
        self.max_caption_length = max_caption_length
        self.tokenizer_num_words = tokenizer_num_words
        # GPT-2 defines no padding token; '<' is used as the pad token here and is
        # stripped again by filter_special_words().
        self.gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2', add_prefix_space=True)
        self.gpt2_tokenizer.pad_token = "<"

    def clean_sentence(self, sentence):
        # Lower-case a sentence and split it into a list of words, dropping punctuation.
        return text_to_word_sequence(sentence, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ')

    def GPT2_pad_token_id(self):
        return self.gpt2_tokenizer.pad_token_id

    def GPT2_eos_token_id(self):
        return self.gpt2_tokenizer.eos_token_id

    def GPT2_encode(self, sentences, pad=True, max_length=None):
        if max_length is None:
            max_length = self.max_caption_length
        # A single string is encoded directly.
        if isinstance(sentences, str):
            return self.gpt2_tokenizer.encode(sentences, add_special_tokens=True, max_length=max_length,
                                              truncation=True, padding='max_length' if pad else False)
        # Otherwise an array of shape (num_sentences, 1) is expected.
        tokens = np.zeros((sentences.shape[0], max_length), dtype=int)

        for i in range(len(sentences)):
            if pd.isna(sentences[i][0]):
                sentences[i][0] = ""
            sentence = sentences[i][0].lower()
            sentence = sentence.replace('"', '')
            sentence = sentence.replace('xxxx', '')
            sentence = sentence.replace('endseq', '<|endoftext|>')  # map the custom end token to GPT-2's
            tokens[i] = self.gpt2_tokenizer.encode(sentence, add_special_tokens=True,
                                                   max_length=max_length, truncation=True,
                                                   padding='max_length' if pad else False)
        return tokens

    def GPT2_decode(self, tokens):
        return self.gpt2_tokenizer.decode(tokens, skip_special_tokens=True)

    def GPT2_format_output(self, sentence):
        sentence = self.clean_sentence(sentence)
        return sentence

    def filter_special_words(self, sentence):
        sentence = sentence.replace('startseq', '')
        sentence = sentence.replace('endseq', '')
        sentence = sentence.replace('<|endoftext|>', '')
        sentence = sentence.replace('<', '')
        sentence = sentence.strip()
        return sentence

    def init_tokenizer(self, sentences):
        # Replace missing captions with empty strings and clean each sentence into a
        # word list, then fit the Keras tokenizer so every word gets a unique id.
        for i in range(len(sentences)):
            if pd.isna(sentences[i]):
                sentences[i] = ""
            sentences[i] = self.clean_sentence(sentences[i])
        self.tokenizer = Tokenizer(oov_token='UNK', num_words=self.tokenizer_num_words)
        self.tokenizer.fit_on_texts(sentences)

    def get_tokenizer_num_words(self):
        return self.tokenizer_num_words

    def get_token_of_word(self, word):
        return self.tokenizer.word_index[word]

    def get_word_from_token(self, token):
        try:
            return self.tokenizer.index_word[token]
        except KeyError:
            return ""

    def get_sentence_from_tokens(self, tokens):
        sentence = []
        for token in tokens[0]:
            word = self.get_word_from_token(token)
            if word == 'endseq':
                return sentence
            if word != 'startseq':
                sentence.append(word)

        return sentence

    def get_string_from_word_list(self, word_list):
        return " ".join(word_list)

    def get_word_tokens_list(self):
        return self.tokenizer.word_index

    def tokenize_sentences(self, sentences):
        # Expects an array of shape (num_sentences, 1); each caption is cleaned,
        # mapped to word ids and post-padded to max_caption_length.
        tokenized_sentences = np.zeros((sentences.shape[0], self.max_caption_length), dtype=int)
        for index, caption in enumerate(sentences):
            tokenized_caption = self.tokenizer.texts_to_sequences([self.clean_sentence(caption[0])])
            tokenized_sentences[index] = pad_sequences(tokenized_caption, maxlen=self.max_caption_length,
                                                       padding='post')
        return tokenized_sentences
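

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module. The column name
    # "impression", the caption length of 40 and the example captions below are
    # illustrative assumptions only.
    wrapper = TokenizerWrapper(class_name="impression", max_caption_length=40)

    captions = np.array([["the heart is normal in size endseq"],
                         ["no acute findings endseq"]], dtype=object)

    # GPT-2 path: encode to fixed-length id arrays, then decode back to text.
    gpt2_tokens = wrapper.GPT2_encode(captions)
    print(gpt2_tokens.shape)  # (2, 40)
    print(wrapper.filter_special_words(wrapper.GPT2_decode(gpt2_tokens[0])))

    # Keras path: fit the word-level tokenizer first, then tokenize the captions.
    wrapper.init_tokenizer([c[0] for c in captions])
    keras_tokens = wrapper.tokenize_sentences(captions)
    print(keras_tokens.shape)  # (2, 40)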