Ziad Meligy committed
Commit 36a5d2b · 1 Parent(s): eb8805a

Pushing deployment to space

Files changed (5)
  1. requirements.txt +13 -0
  2. tags.py +1 -0
  3. tokenizer_wrapper.py +110 -0
  4. utility.py +64 -0
  5. utils.py +39 -0
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ transformers
+ torch
+ pandas
+ numpy
+ matplotlib
+ tiktoken
+ tensorflow
+ gensim
+ torchvision
+ scikit-image
+ pydicom
+ fastapi
+ python-multipart
tags.py ADDED
@@ -0,0 +1 @@
+ tags = ['normal', 'right', 'lung', 'calcified granuloma', 'upper lobe', 'lingula', 'opacity', 'pulmonary atelectasis', 'interstitial', 'bilateral', 'diffuse', 'markings', 'prominent', 'left', 'density', 'retrocardiac', 'metabolic', 'spine', 'calcinosis', 'base', 'bone diseases', 'tortuous', 'indwelling', 'degenerative', 'aorta', 'catheters', 'thoracic vertebrae', 'mild', 'cardiomegaly', 'severe', 'diaphragm', 'elevated', 'hypoinflation', 'pulmonary congestion', 'technical quality of image unsatisfactory', 'chronic', 'pleural effusion', 'consolidation', 'costophrenic angle', 'airspace disease', 'blunted', 'surgical instruments', 'implanted medical device', 'patchy', 'streaky', 'pleura', 'thickening', 'focal', 'cicatrix', 'hilum', 'lower lobe', 'round', 'small', 'hyperdistention', 'mediastinum', 'nodule', 'no indexing', 'posterior', 'obscured', 'scoliosis', 'bronchovascular', 'granulomatous disease', 'multiple', 'osteophyte', 'middle lobe', 'hernia', 'hiatal', 'thoracic', 'pulmonary emphysema', 'lymph nodes', 'atherosclerosis', 'deformity', 'anterior', 'ribs', 'lucency', 'scattered', 'lumbar vertebrae', 'flattened', 'spondylosis', 'bone', 'borderline', 'fractures', 'thorax', 'healed', 'kyphosis', 'chronic obstructive', 'emphysema', 'pulmonary disease', 'infiltrate', 'pulmonary edema', 'moderate', 'enlarged', 'cardiac shadow', 'foreign bodies', 'spinal fusion', 'apex', 'diaphragmatic eventration', 'arthritis', 'pneumonia', 'abdomen', 'large', 'tube', 'inserted', 'paratracheal', 'granuloma']
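
tags.py only defines the label vocabulary, so a short sketch of how such a list is typically consumed may help. The predict_tags helper, the logits input, and the 0.5 threshold below are hypothetical illustrations, not part of this commit:

    import torch
    from tags import tags

    def predict_tags(logits: torch.Tensor, threshold: float = 0.5):
        # hypothetical helper: map multi-label sigmoid outputs back to tag names
        probs = torch.sigmoid(logits)  # one probability per tag
        keep = (probs > threshold).nonzero(as_tuple=True)[0]
        return [tags[i] for i in keep.tolist()]

    # e.g. predict_tags(model_output) might return ['opacity', 'left']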
tokenizer_wrapper.py ADDED
@@ -0,0 +1,110 @@
+ import pandas as pd
+ from tensorflow.keras.preprocessing.text import Tokenizer  # type: ignore
+ from tensorflow.keras.preprocessing.sequence import pad_sequences  # type: ignore
+ from tensorflow.keras.preprocessing.text import text_to_word_sequence  # type: ignore
+ import numpy as np
+ from transformers import GPT2Tokenizer
+
+
+ class TokenizerWrapper:
+     def __init__(self, class_name, max_caption_length, tokenizer_num_words=None):
+         # dataset_df = pd.read_csv(dataset_csv_file)
+         # sentences = dataset_df[class_name].tolist()
+         self.max_caption_length = max_caption_length
+         self.tokenizer_num_words = tokenizer_num_words
+         # self.init_tokenizer(sentences)
+         self.gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2', add_prefix_space=True)
+         # '<' serves as the padding token; it is stripped again in filter_special_words
+         self.gpt2_tokenizer.pad_token = "<"
+
+     def clean_sentence(self, sentence):
+         return text_to_word_sequence(sentence, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ')
+
+     def GPT2_pad_token_id(self):
+         return self.gpt2_tokenizer.pad_token_id
+
+     def GPT2_eos_token_id(self):
+         return self.gpt2_tokenizer.eos_token_id
+
+     def GPT2_encode(self, sentences, pad=True, max_length=None):
+         if max_length is None:
+             max_length = self.max_caption_length
+         # note: pad_to_max_length is deprecated in recent transformers; padding='max_length' is the modern equivalent
+         if isinstance(sentences, str):
+             return self.gpt2_tokenizer.encode(sentences, add_special_tokens=True, max_length=max_length,
+                                               pad_to_max_length=pad)
+         tokens = np.zeros((sentences.shape[0], max_length), dtype=int)
+
+         for i in range(len(sentences)):
+             if pd.isna(sentences[i]):
+                 sentences[i][0] = ""
+             sentence = sentences[i][0].lower()
+             sentence = sentence.replace('"', '')
+             sentence = sentence.replace('xxxx', '')
+             sentence = sentence.replace('endseq', '<|endoftext|>')
+             tokens[i] = self.gpt2_tokenizer.encode(sentence, add_special_tokens=True,
+                                                    max_length=max_length, pad_to_max_length=pad)
+         return tokens
+
+     def GPT2_decode(self, tokens):
+         return self.gpt2_tokenizer.decode(tokens, skip_special_tokens=True)
+
+     def GPT2_format_output(self, sentence):
+         sentence = self.clean_sentence(sentence)
+         return sentence
+
+     def filter_special_words(self, sentence):
+         sentence = sentence.replace('startseq', '')
+         sentence = sentence.replace('endseq', '')
+         sentence = sentence.replace('<|endoftext|>', '')
+         sentence = sentence.replace('<', '')
+         sentence = sentence.strip()
+         return sentence
+
+     def init_tokenizer(self, sentences):
+         for i in range(len(sentences)):
+             if pd.isna(sentences[i]):
+                 sentences[i] = ""
+             sentences[i] = self.clean_sentence(sentences[i])
+         self.tokenizer = Tokenizer(oov_token='UNK', num_words=self.tokenizer_num_words)
+         self.tokenizer.fit_on_texts(sentences)  # give each word a unique id
+
+     def get_tokenizer_num_words(self):
+         return self.tokenizer_num_words
+
+     def get_token_of_word(self, word):
+         return self.tokenizer.word_index[word]
+
+     def get_word_from_token(self, token):
+         try:
+             return self.tokenizer.index_word[token]
+         except KeyError:
+             return ""
+
+     def get_sentence_from_tokens(self, tokens):
+         sentence = []
+         for token in tokens[0]:
+             word = self.get_word_from_token(token)
+             if word == 'endseq':
+                 return sentence
+             if word != 'startseq':
+                 sentence.append(word)
+         return sentence
+
+     def get_string_from_word_list(self, word_list):
+         return " ".join(word_list)
+
+     def get_word_tokens_list(self):
+         return self.tokenizer.word_index
+
+     def tokenize_sentences(self, sentences):
+         index = 0
+         tokenized_sentences = np.zeros((sentences.shape[0], self.max_caption_length), dtype=int)
+         for caption in sentences:
+             tokenized_caption = self.tokenizer.texts_to_sequences([self.clean_sentence(caption[0])])
+             tokenized_sentences[index] = pad_sequences(tokenized_caption, maxlen=self.max_caption_length,
+                                                        padding='post')  # padded to max length
+             index = index + 1
+         return tokenized_sentences
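
A minimal usage sketch for TokenizerWrapper follows; the class_name and max_caption_length values are illustrative assumptions, not taken from this commit:

    from tokenizer_wrapper import TokenizerWrapper

    wrapper = TokenizerWrapper(class_name="caption", max_caption_length=100)  # hypothetical config
    ids = wrapper.GPT2_encode("heart size is normal endseq")  # single string -> id list padded to max_caption_length
    text = wrapper.GPT2_decode(ids)                           # decode, skipping special tokens
    print(wrapper.filter_special_words(text))                 # strip any leftover sequence markers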
utility.py ADDED
@@ -0,0 +1,64 @@
+ import numpy as np
+ import os
+ import pandas as pd
+ import torch
+ import torch.nn as nn
+ from torch.utils.data import DataLoader
+ from generator import AugmentedImageSequence
+ import torch.optim as optim
+ from tensorflow.keras.models import model_from_json  # type: ignore
+
+ def get_dataloader(csv, batch_size, FLAGS, tokenizer_wrapper, augmenter=None):
+     """
+     Replaces the TensorFlow enqueuer with a PyTorch DataLoader.
+     """
+     data_generator = AugmentedImageSequence(
+         dataset_csv_file=csv,
+         class_names=FLAGS.csv_label_columns,
+         tokenizer_wrapper=tokenizer_wrapper,
+         source_image_dir=FLAGS.image_directory,
+         batch_size=batch_size,
+         target_size=FLAGS.image_target_size,
+         augmenter=augmenter,
+         shuffle_on_epoch_end=True,
+     )
+     dataloader = DataLoader(data_generator, shuffle=True, num_workers=0)
+     return dataloader, data_generator.steps
+
+
+ def get_layers(layer_sizes, activation='relu'):
+     """
+     Builds a stack of PyTorch layers from the given sizes: values < 1 become
+     Dropout layers with that probability, values >= 1 become Linear layers
+     followed by the activation.
+     """
+     layers = []
+     activation_fn = getattr(nn, activation.capitalize(), nn.ReLU)  # fall back to nn.ReLU if the name is unknown
+     for layer_size in layer_sizes:
+         if layer_size < 1:
+             layers.append(nn.Dropout(layer_size))
+         else:
+             layers.append(nn.Linear(in_features=layer_size, out_features=layer_size))
+             layers.append(activation_fn())
+     return nn.Sequential(*layers)  # return as a sequential module for easy stacking
+
+
+ def get_optimizer(optimizer_type, learning_rate, lr_decay=0):
+     optimizer_class = getattr(optim, optimizer_type)
+     # created with a placeholder parameter; the real parameters are registered later
+     dummy_param = torch.nn.Parameter(torch.empty(0))
+     optimizer = optimizer_class(params=[dummy_param], lr=learning_rate, weight_decay=lr_decay)
+     return optimizer
+
+
+ def load_model(load_path, model_name):
+     path = os.path.join(load_path, model_name)
+
+     # load the architecture from JSON and rebuild the model
+     with open('{}.json'.format(path), 'r') as json_file:
+         loaded_model_json = json_file.read()
+     loaded_model = model_from_json(loaded_model_json)
+     # load the weights into the rebuilt model
+     loaded_model.load_weights("{}.h5".format(path))
+     print("Loaded model from disk")
+     return loaded_model
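
A brief sketch of how get_layers and get_optimizer compose; the layer sizes, optimizer name, and learning rate are illustrative assumptions:

    from utility import get_layers, get_optimizer

    # hypothetical head: Linear(256,256)+ReLU, Dropout(0.2), Linear(256,256)+ReLU
    head = get_layers([256, 0.2, 256], activation='relu')
    optimizer = get_optimizer('Adam', learning_rate=1e-4)
    optimizer.add_param_group({'params': list(head.parameters())})  # register the real parameters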
utils.py ADDED
@@ -0,0 +1,39 @@
+ from PIL import Image
+ import io
+ import numpy as np
+ import pydicom
+ import torch
+ from fastapi import HTTPException, UploadFile
+ from skimage.transform import resize
+
+
+ def load_image(image):
+     image = image.convert("RGB")
+     image_array = np.asarray(image) / 255.0  # normalize to [0, 1]
+     image_array = resize(image_array, (224, 224))
+     image_tensor = torch.tensor(image_array, dtype=torch.float32).permute(2, 0, 1)  # HxWxC -> CxHxW
+
+     # standard ImageNet normalization statistics
+     mean = torch.tensor([0.485, 0.456, 0.406])
+     std = torch.tensor([0.229, 0.224, 0.225])
+     image_tensor = (image_tensor - mean[:, None, None]) / std[:, None, None]
+
+     return image_tensor.unsqueeze(0)  # add batch dimension
+
+
+ async def convert_to_png(file: UploadFile) -> Image.Image:
+     """Converts a JPG, PNG, or DICOM upload to a PIL image."""
+     image_data = await file.read()
+     if file.content_type in ["image/jpeg", "image/png", "image/jpg"]:
+         image = Image.open(io.BytesIO(image_data))
+         return image
+
+     if file.content_type == "application/dicom" or file.filename.endswith((".dcm", ".dicom")):
+         dicom_data = pydicom.dcmread(io.BytesIO(image_data))
+         pixel_array = dicom_data.pixel_array
+
+         # rescale to 8-bit before converting to a PIL image
+         if pixel_array.dtype != np.uint8:
+             pixel_array = (pixel_array / pixel_array.max() * 255).astype(np.uint8)
+
+         image = Image.fromarray(pixel_array).convert("RGB")
+         return image
+
+     raise HTTPException(status_code=400, detail="Unsupported media type")
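
For context, a sketch of how these helpers might be wired into a FastAPI endpoint; the app, route, and commented-out model call are hypothetical and not part of this commit:

    from fastapi import FastAPI, UploadFile
    from utils import convert_to_png, load_image

    app = FastAPI()

    @app.post("/predict")
    async def predict(file: UploadFile):
        image = await convert_to_png(file)  # PIL image from a JPG/PNG/DICOM upload
        tensor = load_image(image)          # normalized 1x3x224x224 float tensor
        # outputs = model(tensor)           # hypothetical model call
        return {"input_shape": list(tensor.shape)}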