danhtran2mind committed (verified)
Commit df08b89 · Parent: 30c293c

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ dataset/train/train.en filter=lfs diff=lfs merge=lfs -text
+ dataset/train/train.vi filter=lfs diff=lfs merge=lfs -text
+ dataset_convert/train/train.en filter=lfs diff=lfs merge=lfs -text
+ dataset_convert/train/train.vi filter=lfs diff=lfs merge=lfs -text
+ saved_models/en_vi_translation.keras filter=lfs diff=lfs merge=lfs -text
__pycache__/translator.cpython-311.pyc ADDED
Binary file (3.39 kB).
 
dataset/dev/dev.en ADDED
The diff for this file is too large to render.
 
dataset/dev/dev.vi ADDED
The diff for this file is too large to render.
 
dataset/test/test.en ADDED
The diff for this file is too large to render.
 
dataset/test/test.vi ADDED
The diff for this file is too large to render.
 
dataset/train/train.en ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c26dfeed74b6bf3752f5ca552f2412456f0de153f7c804df8717931fb3a5c78a
+ size 13603614
dataset/train/train.vi ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:707206edf2dc0280273952c7b70544ea8a1363aa69aaeb9d70514b888dc3067d
+ size 18074646
dataset_convert/dev/dev.en ADDED
The diff for this file is too large to render.
 
dataset_convert/dev/dev.vi ADDED
The diff for this file is too large to render.
 
dataset_convert/test/test.en ADDED
The diff for this file is too large to render.
 
dataset_convert/test/test.vi ADDED
The diff for this file is too large to render.
 
dataset_convert/train/train.en ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c26dfeed74b6bf3752f5ca552f2412456f0de153f7c804df8717931fb3a5c78a
+ size 13603614
dataset_convert/train/train.vi ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:28fe7bf65585138135caa5d35b3426fcd37748a8c392608132decf36ae275d89
+ size 19722027
en-vi-translation-transformer-tensorflow.log ADDED
@@ -0,0 +1,5 @@
+ [{"stream_name":"stderr","time":5.45447475,"data":"/usr/local/lib/python3.10/dist-packages/traitlets/traitlets.py:2915: FutureWarning: --Exporter.preprocessors=[\"nbconvert.preprocessors.ExtractOutputPreprocessor\"] for containers is deprecated in traitlets 5.0. You can pass `--Exporter.preprocessors item` ... multiple times to add items to a list.\n"}
+ ,{"stream_name":"stderr","time":5.454559784,"data":" warn(\n"}
+ ,{"stream_name":"stderr","time":5.528515974,"data":"[NbConvertApp] Converting notebook __notebook__.ipynb to html\n"}
+ ,{"stream_name":"stderr","time":8.239659837,"data":"[NbConvertApp] Writing 448192 bytes to __results__.html\n"}
+ ]
main.py ADDED
@@ -0,0 +1,104 @@
+ import tensorflow as tf
+ from translator import Translator
+ from utils import tokenizer_utils
+ from utils.preprocessing import input_processing, output_processing
+ from models.transformer import Transformer
+ from models.encoder import Encoder
+ from models.decoder import Decoder
+ from models.layers import EncoderLayer, DecoderLayer, MultiHeadAttention, point_wise_feed_forward_network
+ from models.utils import masked_loss, masked_accuracy
+ import argparse
+
+ def main(sentences: list, model: tf.keras.Model, en_tokenizer, vi_tokenizer) -> None:
+     """
+     Translates input English sentences to Vietnamese using a pre-trained model.
+
+     Args:
+         sentences (list): List of English sentences to translate.
+         model (tf.keras.Model): The pre-trained translation model.
+         en_tokenizer: English tokenizer.
+         vi_tokenizer: Vietnamese tokenizer.
+     """
+     # Initialize the translator with tokenizers and the model
+     translator = Translator(en_tokenizer, vi_tokenizer, model)
+
+     # Process and translate each sentence
+     for sentence in sentences:
+         processed_sentence = input_processing(sentence)
+         translated_text = translator(processed_sentence)
+         translated_text = output_processing(translated_text)
+
+         # Display the input and translated text
+         print("Input:", processed_sentence)
+         print("Translated:", translated_text)
+         print("-" * 50)
+
+ if __name__ == "__main__":
+     # Set up argument parser
+     parser = argparse.ArgumentParser(
+         description="Translate English sentences to Vietnamese using a pre-trained transformer model.",
+         epilog="Example: python main.py --sentence 'Hello, world!' --sentence 'The sun is shining.'"
+     )
+     parser.add_argument(
+         "--sentence",
+         type=str,
+         nargs="*",
+         default=[
+             (
+                 "For at least six centuries, residents along a lake in the mountains of central Japan "
+                 "have marked the depth of winter by celebrating the return of a natural phenomenon "
+                 "once revered as the trail of a wandering god."
+             )
+         ],
+         help="One or more English sentences to translate (default: provided example sentence)"
+     )
+     parser.add_argument(
+         "--model_path",
+         type=str,
+         default="saved_models/en_vi_translation.keras",
+         help="Path to the pre-trained model file (default: saved_models/en_vi_translation.keras)"
+     )
+
+     # Parse arguments
+     args = parser.parse_args()
+
+     # Define custom objects required for loading the model
+     custom_objects = {
+         "Transformer": Transformer,
+         "Encoder": Encoder,
+         "Decoder": Decoder,
+         "EncoderLayer": EncoderLayer,
+         "DecoderLayer": DecoderLayer,
+         "MultiHeadAttention": MultiHeadAttention,
+         "point_wise_feed_forward_network": point_wise_feed_forward_network,
+         "masked_loss": masked_loss,
+         "masked_accuracy": masked_accuracy,
+     }
+
+     # Load the pre-trained model once
+     print("Loading model from:", args.model_path)
+     loaded_model = tf.keras.models.load_model(
+         args.model_path, custom_objects=custom_objects
+     )
+     print("Model loaded successfully.")
+
+     # Load English and Vietnamese tokenizers once
+     en_tokenizer, vi_tokenizer = tokenizer_utils.load_tokenizers()
+
+     # Run the translation for all provided sentences
+     main(sentences=args.sentence, model=loaded_model, en_tokenizer=en_tokenizer, vi_tokenizer=vi_tokenizer)
+
+     # Interactive loop for additional translations
+     while True:
+         choice = input("Would you like to translate another sentence? (y/n): ").strip().lower()
+         if choice in ['no', 'n', 'quit', 'q']:
+             print("Exiting the program.")
+             break
+         elif choice in ['yes', 'y']:
+             new_sentence = input("Enter an English sentence to translate: ").strip()
+             if new_sentence:
+                 main(sentences=[new_sentence], model=loaded_model, en_tokenizer=en_tokenizer, vi_tokenizer=vi_tokenizer)
+             else:
+                 print("No sentence provided. Please try again.")
+         else:
+             print("Invalid input. Please enter 'y' or 'n'.")
models/__pycache__/decoder.cpython-311.pyc ADDED
Binary file (4.88 kB).
 
models/__pycache__/encoder.cpython-311.pyc ADDED
Binary file (4.82 kB).
 
models/__pycache__/layers.cpython-311.pyc ADDED
Binary file (9.45 kB).
 
models/__pycache__/transformer.cpython-311.pyc ADDED
Binary file (4 kB).
 
models/__pycache__/utils.cpython-311.pyc ADDED
Binary file (1.76 kB).
 
models/decoder.py ADDED
@@ -0,0 +1,56 @@
+ import tensorflow as tf
+ from .layers import DecoderLayer
+
+ @tf.keras.utils.register_keras_serializable()
+ class Decoder(tf.keras.layers.Layer):
+     def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
+                  max_tokens, dropout_rate, **kwargs):
+         super(Decoder, self).__init__(**kwargs)
+         self.d_model = d_model
+         self.num_layers = num_layers
+         # Stored so get_config() can serialize these hyperparameters.
+         self.num_heads = num_heads
+         self.dff = dff
+         self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
+         self.pos_encoding = self.positional_encoding(max_tokens, d_model)
+         self.dec_layers = [DecoderLayer(d_model, num_heads, dff, dropout_rate)
+                            for _ in range(num_layers)]
+         self.dropout = tf.keras.layers.Dropout(dropout_rate)
+
+     def call(self, x, enc_output, training=None, look_ahead_mask=None, padding_mask=None):
+         seq_len = tf.shape(x)[1]
+         x = self.embedding(x)
+         x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
+         x += self.pos_encoding[:, :seq_len, :]
+         x = self.dropout(x, training=training)
+         for i in range(self.num_layers):
+             x = self.dec_layers[i](x, enc_output, training=training,
+                                    look_ahead_mask=look_ahead_mask,
+                                    padding_mask=padding_mask)
+         return x
+
+     def positional_encoding(self, max_len, d_model):
+         angle_rads = self.get_angles(tf.range(max_len, dtype=tf.float32)[:, tf.newaxis],
+                                      tf.range(d_model, dtype=tf.float32)[tf.newaxis, :],
+                                      d_model)
+         sines = tf.math.sin(angle_rads[:, 0::2])
+         cosines = tf.math.cos(angle_rads[:, 1::2])
+         pos_encoding = tf.concat([sines, cosines], axis=-1)
+         return pos_encoding[tf.newaxis, ...]
+
+     def get_angles(self, pos, i, d_model):
+         angle_rates = 1 / tf.pow(10000, (2 * (i // 2)) / tf.cast(d_model, tf.float32))
+         return pos * angle_rates
+
+     def get_config(self):
+         config = super().get_config()
+         config.update({
+             'num_layers': self.num_layers,
+             'd_model': self.d_model,
+             'num_heads': self.num_heads,
+             'dff': self.dff,
+             'target_vocab_size': self.embedding.input_dim,
+             'max_tokens': self.pos_encoding.shape[1],
+             'dropout_rate': self.dropout.rate
+         })
+         return config
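
A note on the positional encoding above: it uses the standard Transformer angle schedule \(\theta(pos, i) = pos / 10000^{2\lfloor i/2 \rfloor / d_{model}}\), but concatenates the sine half and the cosine half along the feature axis instead of interleaving them as in "Attention Is All You Need". Either layout assigns each position a distinct code, and training and inference here share the same layout, so the model is self-consistent.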
models/encoder.py ADDED
@@ -0,0 +1,54 @@
+ import tensorflow as tf
+ from .layers import EncoderLayer
+
+ @tf.keras.utils.register_keras_serializable()
+ class Encoder(tf.keras.layers.Layer):
+     def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
+                  max_tokens, dropout_rate, **kwargs):
+         super(Encoder, self).__init__(**kwargs)
+         self.d_model = d_model
+         self.num_layers = num_layers
+         # Stored so get_config() can serialize these hyperparameters.
+         self.num_heads = num_heads
+         self.dff = dff
+         self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
+         self.pos_encoding = self.positional_encoding(max_tokens, d_model)
+         self.enc_layers = [EncoderLayer(d_model, num_heads, dff, dropout_rate)
+                            for _ in range(num_layers)]
+         self.dropout = tf.keras.layers.Dropout(dropout_rate)
+
+     def call(self, x, training=None, mask=None):
+         seq_len = tf.shape(x)[1]
+         x = self.embedding(x)
+         x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
+         x += self.pos_encoding[:, :seq_len, :]
+         x = self.dropout(x, training=training)
+         for i in range(self.num_layers):
+             x = self.enc_layers[i](x, training=training, mask=mask)
+         return x
+
+     def positional_encoding(self, max_len, d_model):
+         angle_rads = self.get_angles(tf.range(max_len, dtype=tf.float32)[:, tf.newaxis],
+                                      tf.range(d_model, dtype=tf.float32)[tf.newaxis, :],
+                                      d_model)
+         sines = tf.math.sin(angle_rads[:, 0::2])
+         cosines = tf.math.cos(angle_rads[:, 1::2])
+         pos_encoding = tf.concat([sines, cosines], axis=-1)
+         return pos_encoding[tf.newaxis, ...]
+
+     def get_angles(self, pos, i, d_model):
+         angle_rates = 1 / tf.pow(10000, (2 * (i // 2)) / tf.cast(d_model, tf.float32))
+         return pos * angle_rates
+
+     def get_config(self):
+         config = super().get_config()
+         config.update({
+             'num_layers': self.num_layers,
+             'd_model': self.d_model,
+             'num_heads': self.num_heads,
+             'dff': self.dff,
+             'input_vocab_size': self.embedding.input_dim,
+             'max_tokens': self.pos_encoding.shape[1],
+             'dropout_rate': self.dropout.rate
+         })
+         return config
models/layers.py ADDED
@@ -0,0 +1,123 @@
+ import tensorflow as tf
+ from tensorflow.keras.layers import Layer, Dense, Dropout, LayerNormalization
+
+ @tf.keras.utils.register_keras_serializable()
+ class EncoderLayer(Layer):
+     def __init__(self, d_model, num_heads, dff, dropout_rate, **kwargs):
+         super(EncoderLayer, self).__init__(**kwargs)
+         self.mha = MultiHeadAttention(d_model, num_heads)
+         self.ffn = point_wise_feed_forward_network(d_model, dff)
+         self.layernorm1 = LayerNormalization(epsilon=1e-6)
+         self.layernorm2 = LayerNormalization(epsilon=1e-6)
+         self.dropout1 = Dropout(dropout_rate)
+         self.dropout2 = Dropout(dropout_rate)
+
+     def call(self, x, training=None, mask=None):
+         attn_output, _ = self.mha(x, x, x, mask)
+         attn_output = self.dropout1(attn_output, training=training)
+         out1 = self.layernorm1(x + attn_output)
+         ffn_output = self.ffn(out1)
+         ffn_output = self.dropout2(ffn_output, training=training)
+         out2 = self.layernorm2(out1 + ffn_output)
+         return out2
+
+     def get_config(self):
+         config = super().get_config()
+         config.update({
+             'd_model': self.mha.d_model,
+             'num_heads': self.mha.num_heads,
+             'dff': self.ffn.layers[0].units,
+             'dropout_rate': self.dropout1.rate
+         })
+         return config
+
+ @tf.keras.utils.register_keras_serializable()
+ class DecoderLayer(Layer):
+     def __init__(self, d_model, num_heads, dff, dropout_rate, **kwargs):
+         super(DecoderLayer, self).__init__(**kwargs)
+         self.mha1 = MultiHeadAttention(d_model, num_heads)
+         self.mha2 = MultiHeadAttention(d_model, num_heads)
+         self.ffn = point_wise_feed_forward_network(d_model, dff)
+         self.layernorm1 = LayerNormalization(epsilon=1e-6)
+         self.layernorm2 = LayerNormalization(epsilon=1e-6)
+         self.layernorm3 = LayerNormalization(epsilon=1e-6)
+         self.dropout1 = Dropout(dropout_rate)
+         self.dropout2 = Dropout(dropout_rate)
+         self.dropout3 = Dropout(dropout_rate)
+
+     def call(self, x, enc_output, training=None, look_ahead_mask=None, padding_mask=None):
+         attn1, _ = self.mha1(x, x, x, look_ahead_mask)
+         attn1 = self.dropout1(attn1, training=training)
+         out1 = self.layernorm1(x + attn1)
+         attn2, _ = self.mha2(enc_output, enc_output, out1, padding_mask)
+         attn2 = self.dropout2(attn2, training=training)
+         out2 = self.layernorm2(out1 + attn2)
+         ffn_output = self.ffn(out2)
+         ffn_output = self.dropout3(ffn_output, training=training)
+         out3 = self.layernorm3(out2 + ffn_output)
+         return out3
+
+     def get_config(self):
+         config = super().get_config()
+         config.update({
+             'd_model': self.mha1.d_model,
+             'num_heads': self.mha1.num_heads,
+             'dff': self.ffn.layers[0].units,
+             'dropout_rate': self.dropout1.rate
+         })
+         return config
+
+ @tf.keras.utils.register_keras_serializable()
+ class MultiHeadAttention(Layer):
+     def __init__(self, d_model, num_heads, **kwargs):
+         super(MultiHeadAttention, self).__init__(**kwargs)
+         self.num_heads = num_heads
+         self.d_model = d_model
+         assert d_model % num_heads == 0
+         self.depth = d_model // num_heads
+         self.wq = Dense(d_model)
+         self.wk = Dense(d_model)
+         self.wv = Dense(d_model)
+         self.dense = Dense(d_model)
+
+     def split_heads(self, x, batch_size):
+         x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
+         return tf.transpose(x, perm=[0, 2, 1, 3])
+
+     def call(self, v, k, q, mask=None):
+         batch_size = tf.shape(q)[0]
+         q = self.wq(q)
+         k = self.wk(k)
+         v = self.wv(v)
+         q = self.split_heads(q, batch_size)
+         k = self.split_heads(k, batch_size)
+         v = self.split_heads(v, batch_size)
+         scaled_attention, attention_weights = self.scaled_dot_product_attention(q, k, v, mask)
+         scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
+         concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
+         output = self.dense(concat_attention)
+         return output, attention_weights
+
+     def scaled_dot_product_attention(self, q, k, v, mask):
+         matmul_qk = tf.matmul(q, k, transpose_b=True)
+         dk = tf.cast(tf.shape(k)[-1], tf.float32)
+         scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
+         if mask is not None:
+             scaled_attention_logits += (mask * -1e9)
+         attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
+         output = tf.matmul(attention_weights, v)
+         return output, attention_weights
+
+     def get_config(self):
+         config = super().get_config()
+         config.update({
+             'd_model': self.d_model,
+             'num_heads': self.num_heads
+         })
+         return config
+
+ def point_wise_feed_forward_network(d_model, dff):
+     return tf.keras.Sequential([
+         Dense(dff, activation='relu'),
+         Dense(d_model)
+     ])
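
As a quick sanity check of the attention layer's shapes, here is a minimal sketch (not part of the commit; it assumes the repository root is on PYTHONPATH so models.layers is importable, and uses toy dimensions):

    import tensorflow as tf
    from models.layers import MultiHeadAttention

    mha = MultiHeadAttention(d_model=128, num_heads=8)
    x = tf.random.uniform((2, 10, 128))   # (batch, seq_len, d_model)
    out, attn = mha(x, x, x, mask=None)   # self-attention
    print(out.shape)    # (2, 10, 128): same shape as the input
    print(attn.shape)   # (2, 8, 10, 10): one attention map per head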
models/transformer.py ADDED
@@ -0,0 +1,58 @@
+ import tensorflow as tf
+ from .encoder import Encoder
+ from .decoder import Decoder
+ from tensorflow.keras.layers import Dense
+
+ @tf.keras.utils.register_keras_serializable()
+ class Transformer(tf.keras.Model):
+     def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
+                  target_vocab_size, max_tokens, dropout_rate=0.1, **kwargs):
+         super(Transformer, self).__init__(**kwargs)
+         self.num_layers = num_layers
+         self.d_model = d_model
+         self.num_heads = num_heads
+         self.dff = dff
+         self.input_vocab_size = input_vocab_size
+         self.target_vocab_size = target_vocab_size
+         self.max_tokens = max_tokens
+         self.dropout_rate = dropout_rate
+
+         self.encoder = Encoder(num_layers, d_model, num_heads, dff,
+                                input_vocab_size, max_tokens, dropout_rate)
+         self.decoder = Decoder(num_layers, d_model, num_heads, dff,
+                                target_vocab_size, max_tokens, dropout_rate)
+         self.final_layer = Dense(target_vocab_size)
+
+     def call(self, inputs, training=None):
+         enc_input, dec_input = inputs
+         enc_padding_mask = self.create_padding_mask(enc_input)
+         look_ahead_mask = self.create_look_ahead_mask(tf.shape(dec_input)[1])
+         dec_padding_mask = self.create_padding_mask(enc_input)
+         enc_output = self.encoder(enc_input, training=training, mask=enc_padding_mask)
+         dec_output = self.decoder(dec_input, enc_output, training=training,
+                                   look_ahead_mask=look_ahead_mask,
+                                   padding_mask=dec_padding_mask)
+         final_output = self.final_layer(dec_output)
+         return final_output
+
+     def create_padding_mask(self, seq):
+         mask = tf.cast(tf.math.equal(seq, 0), tf.float32)
+         return mask[:, tf.newaxis, tf.newaxis, :]
+
+     def create_look_ahead_mask(self, size):
+         mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
+         return mask
+
+     def get_config(self):
+         config = super().get_config()
+         config.update({
+             'num_layers': self.num_layers,
+             'd_model': self.d_model,
+             'num_heads': self.num_heads,
+             'dff': self.dff,
+             'input_vocab_size': self.input_vocab_size,
+             'target_vocab_size': self.target_vocab_size,
+             'max_tokens': self.max_tokens,
+             'dropout_rate': self.dropout_rate
+         })
+         return config
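
End to end, the model maps a pair of integer token-id tensors to per-position logits over the target vocabulary. A minimal sketch with made-up toy hyperparameters (the shipped checkpoint's actual values may differ):

    import tensorflow as tf
    from models.transformer import Transformer

    model = Transformer(num_layers=2, d_model=128, num_heads=8, dff=512,
                        input_vocab_size=1000, target_vocab_size=1200,
                        max_tokens=64, dropout_rate=0.1)
    enc_in = tf.ones((2, 20), dtype=tf.int32)  # padded source token ids
    dec_in = tf.ones((2, 15), dtype=tf.int32)  # shifted target token ids
    logits = model([enc_in, dec_in], training=False)
    print(logits.shape)  # (2, 15, 1200)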
models/utils.py ADDED
@@ -0,0 +1,23 @@
+ import tensorflow as tf
+
+ @tf.keras.utils.register_keras_serializable()
+ def masked_loss(label, pred):
+     mask = label != 0
+     loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
+         from_logits=True, reduction='none')
+     loss = loss_object(label, pred)
+     mask = tf.cast(mask, dtype=loss.dtype)
+     loss *= mask
+     loss = tf.reduce_sum(loss) / tf.reduce_sum(mask)
+     return loss
+
+ @tf.keras.utils.register_keras_serializable()
+ def masked_accuracy(label, pred):
+     pred = tf.argmax(pred, axis=2)
+     label = tf.cast(label, pred.dtype)
+     match = label == pred
+     mask = label != 0
+     match = match & mask
+     match = tf.cast(match, dtype=tf.float32)
+     mask = tf.cast(mask, dtype=tf.float32)
+     return tf.reduce_sum(match) / tf.reduce_sum(mask)
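
Both metrics treat label id 0 as padding and drop those positions from the average. A small illustrative example (made-up values):

    import tensorflow as tf
    from models.utils import masked_loss, masked_accuracy

    label = tf.constant([[5, 3, 0, 0]])                 # last two positions are padding
    pred = tf.one_hot([[5, 3, 1, 1]], depth=10) * 10.0  # sharp logits
    print(masked_accuracy(label, pred).numpy())  # 1.0: padded positions are ignored
    print(masked_loss(label, pred).numpy())      # cross-entropy over the 2 real tokens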
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ pandas==2.2.3
+ matplotlib==3.7.5
+ tensorflow_datasets==4.9.7
+ tensorflow==2.17.1
+ tensorflow-text==2.17.0
+ underthesea==6.8.4
saved_models/backup_weights/latest.weights.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b4e1e9122a5896689ccb4cb169ff3b2edf9e5474655468fd638b68cb31159eb4
+ size 368283712
saved_models/backup_weights/training_metadata.json ADDED
@@ -0,0 +1 @@
+ {"epoch": 50, "batch": 0}
saved_models/en_vi_translation.keras ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:85b96fdac99f5d2b1fb6f6823cb6c5691fd2ff339d8a966f0b0cc6da52cc33fd
+ size 368250218
tokenizers/en_tokenizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d172551a4da648a35629de06593b3051ce32598c3166464de46fc4a1f6f90981
+ size 3973568
tokenizers/vi_tokenizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:42a96a2d1b932a52603083dff973a0eba79998c5e2c2717e54f5c6eeaf84b8fa
+ size 5835250
translator.py ADDED
@@ -0,0 +1,47 @@
+ import tensorflow as tf
+ import numpy as np
+ from models.transformer import Transformer
+ from models.encoder import Encoder
+ from models.decoder import Decoder
+ from models.layers import EncoderLayer, DecoderLayer, MultiHeadAttention, point_wise_feed_forward_network
+ from models.utils import masked_loss, masked_accuracy
+
+ class Translator(tf.Module):
+     def __init__(self, en_tokenizer, vi_tokenizer, transformer, max_tokens=512):
+         self.tokenizers_en = en_tokenizer
+         self.tokenizers_vi = vi_tokenizer
+         self.transformer = transformer
+         self.max_tokens = max_tokens
+
+     def __call__(self, sentence, max_length=None):
+         if max_length is None:
+             max_length = self.max_tokens
+
+         # Tokenize the source sentence and right-pad it with zeros to max_tokens.
+         sentence = self.tokenizers_en.texts_to_sequences([sentence])
+         sentence = sentence[0] + np.zeros(self.max_tokens - len(sentence[0]),
+                                           dtype=np.int32).tolist()
+         sentence = tf.convert_to_tensor([sentence])
+         encoder_input = sentence
+
+         start = self.tokenizers_vi.texts_to_sequences(["<sos>"])[0]
+         end = self.tokenizers_vi.texts_to_sequences(["<eos>"])[0]
+
+         # Greedy decoding: start from <sos> and append the argmax token
+         # until <eos> is produced or max_length is reached.
+         output_array = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)
+         output_array = output_array.write(0, start)
+
+         for i in tf.range(max_length):
+             output = tf.transpose(output_array.stack())
+             predictions = self.transformer([encoder_input, output], training=False)
+             predictions = predictions[:, -1:, :]  # logits for the last position only
+             predicted_id = tf.argmax(predictions, axis=-1, output_type=tf.int32)
+             output_array = output_array.write(i + 1, predicted_id[0])
+
+             if predicted_id == end:
+                 break
+
+         output = tf.transpose(output_array.stack())
+         text = self.tokenizers_vi.sequences_to_texts(output.numpy().tolist())[0]
+         return text
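
A minimal wiring sketch for the class above (not part of the commit; it assumes the checkpoint and tokenizer pickles from this commit sit at their relative paths, and that importing the model modules registers the custom classes so load_model can resolve them — main.py above passes an explicit custom_objects mapping instead, which is the more robust route):

    import tensorflow as tf
    from models.transformer import Transformer              # noqa: F401 (registers custom layers)
    from models.utils import masked_loss, masked_accuracy   # noqa: F401
    from translator import Translator
    from utils import tokenizer_utils
    from utils.preprocessing import input_processing, output_processing

    model = tf.keras.models.load_model("saved_models/en_vi_translation.keras")
    en_tok, vi_tok = tokenizer_utils.load_tokenizers()
    translate = Translator(en_tok, vi_tok, model)
    print(output_processing(translate(input_processing("Good morning!"))))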
utils/__pycache__/preprocessing.cpython-311.pyc ADDED
Binary file (1.17 kB).
 
utils/__pycache__/tokenizer_utils.cpython-311.pyc ADDED
Binary file (1.28 kB).
 
utils/preprocessing.py ADDED
@@ -0,0 +1,21 @@
+ import string
+
+ def input_processing(in_string):
+     # Separate each punctuation mark from the preceding word with a space,
+     # so it is split off as its own token.
+     punct_marks = string.punctuation
+     for mark in punct_marks:
+         if mark in in_string:
+             in_string = in_string.replace(mark, " " + mark)
+     in_string = in_string.replace("scholl", "school")  # hard-coded spelling fix
+     in_string = in_string.strip()
+
+     return in_string
+
+ def output_processing(in_string):
+     # Undo word segmentation and strip the special tokens.
+     in_string = in_string.replace("_", " ").replace('<sos>', '').replace('<eos>', '').strip()
+     if in_string:
+         in_string = in_string[0].upper() + in_string[1:]
+
+     return in_string
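
Concretely (doctest-style; the Vietnamese output tokens are made up for illustration):

    >>> input_processing("Hello, world!")
    'Hello , world !'
    >>> output_processing("<sos> xin_chào thế_giới <eos>")
    'Xin chào thế giới'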
utils/tokenizer_utils.py ADDED
@@ -0,0 +1,15 @@
+ import pickle
+ import tensorflow as tf
+
+ def load_tokenizers(en_path='tokenizers/en_tokenizer.pkl',
+                     vi_path='tokenizers/vi_tokenizer.pkl'):
+     with open(en_path, 'rb') as f:
+         en_tokenizer = pickle.load(f)
+     with open(vi_path, 'rb') as f:
+         vi_tokenizer = pickle.load(f)
+
+     # The pickles store the tokenizers' JSON configs; rebuild Keras tokenizers from them.
+     en_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(en_tokenizer)
+     vi_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(vi_tokenizer)
+
+     return en_tokenizer, vi_tokenizer