Spaces:
Sleeping
Sleeping
Ziad Meligy
committed on
Commit
·
36a5d2b
1
Parent(s):
eb8805a
Pushing deployment to space
Browse files- requirements.txt +13 -0
- tags.py +1 -0
- tokenizer_wrapper.py +110 -0
- utility.py +64 -0
- utils.py +39 -0
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
transformers
|
| 2 |
+
torch
|
| 3 |
+
pandas
|
| 4 |
+
numpy
|
| 5 |
+
matplotlib
|
| 6 |
+
tiktoken
|
| 7 |
+
tensorflow
|
| 8 |
+
gensim
|
| 9 |
+
torchvision
|
| 10 |
+
scikit-image
|
| 11 |
+
pydicom
|
| 12 |
+
fastapi
|
| 13 |
+
python-multipart
|
tags.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Label vocabulary exposed by this module as a flat list of lowercase strings.
# NOTE(review): the entries look like chest-radiograph finding/anatomy terms;
# the order is presumably significant to consumers (e.g. label indices), so
# it is preserved exactly -- confirm before reordering or de-duplicating.
tags = [
    "normal", "right", "lung", "calcified granuloma", "upper lobe",
    "lingula", "opacity", "pulmonary atelectasis", "interstitial",
    "bilateral", "diffuse", "markings", "prominent", "left", "density",
    "retrocardiac", "metabolic", "spine", "calcinosis", "base",
    "bone diseases", "tortuous", "indwelling", "degenerative", "aorta",
    "catheters", "thoracic vertebrae", "mild", "cardiomegaly", "severe",
    "diaphragm", "elevated", "hypoinflation", "pulmonary congestion",
    "technical quality of image unsatisfactory", "chronic",
    "pleural effusion", "consolidation", "costophrenic angle",
    "airspace disease", "blunted", "surgical instruments",
    "implanted medical device", "patchy", "streaky", "pleura",
    "thickening", "focal", "cicatrix", "hilum", "lower lobe", "round",
    "small", "hyperdistention", "mediastinum", "nodule", "no indexing",
    "posterior", "obscured", "scoliosis", "bronchovascular",
    "granulomatous disease", "multiple", "osteophyte", "middle lobe",
    "hernia", "hiatal", "thoracic", "pulmonary emphysema", "lymph nodes",
    "atherosclerosis", "deformity", "anterior", "ribs", "lucency",
    "scattered", "lumbar vertebrae", "flattened", "spondylosis", "bone",
    "borderline", "fractures", "thorax", "healed", "kyphosis",
    "chronic obstructive", "emphysema", "pulmonary disease", "infiltrate",
    "pulmonary edema", "moderate", "enlarged", "cardiac shadow",
    "foreign bodies", "spinal fusion", "apex", "diaphragmatic eventration",
    "arthritis", "pneumonia", "abdomen", "large", "tube", "inserted",
    "paratracheal", "granuloma",
]
|
tokenizer_wrapper.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from tensorflow.keras.preprocessing.text import Tokenizer # type: ignore
|
| 3 |
+
from tensorflow.keras.preprocessing.sequence import pad_sequences # type: ignore
|
| 4 |
+
from tensorflow.keras.preprocessing.text import text_to_word_sequence # type: ignore
|
| 5 |
+
import numpy as np
|
| 6 |
+
from transformers import GPT2Tokenizer
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class TokenizerWrapper:
    """Caption tokenisation helpers built on GPT-2 and a Keras word tokenizer.

    Two independent tokenizers live on an instance:

    * ``gpt2_tokenizer`` -- the pretrained ``'gpt2'`` tokenizer from
      ``transformers``, loaded in ``__init__`` and used by the ``GPT2_*``
      methods.
    * ``tokenizer`` -- a Keras word-level ``Tokenizer`` that only exists
      after ``init_tokenizer`` has been called; the ``get_*`` word/token
      helpers and ``tokenize_sentences`` depend on it.
    """

    def __init__(self, class_name, max_caption_length, tokenizer_num_words=None):
        """Load the GPT-2 tokenizer and store the caption settings.

        Args:
            class_name: Accepted for interface compatibility; not used here
                (the CSV-based fitting that consumed it is disabled).
            max_caption_length: Default sequence length for encoding/padding.
            tokenizer_num_words: ``num_words`` forwarded to the Keras
                ``Tokenizer`` when ``init_tokenizer`` is called.
        """
        self.max_caption_length = max_caption_length
        self.tokenizer_num_words = tokenizer_num_words
        # The word-level tokenizer is fitted lazily by init_tokenizer().
        self.tokenizer = None
        self.gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2', add_prefix_space=True)
        # NOTE(review): "<" is an ordinary character that can also occur in
        # text, so padding positions are presumably indistinguishable from a
        # literal "<" -- confirm this is intended.
        self.gpt2_tokenizer.pad_token = "<"

    def _keras_tokenizer(self):
        """Return the fitted Keras tokenizer or raise a descriptive error."""
        tokenizer = getattr(self, "tokenizer", None)
        if tokenizer is None:
            raise AttributeError(
                "word-level tokenizer is not fitted; call init_tokenizer() first"
            )
        return tokenizer

    def clean_sentence(self, sentence):
        """Lower-case ``sentence``, strip punctuation and split it into words."""
        return text_to_word_sequence(sentence, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ')

    def GPT2_pad_token_id(self):
        """Return the id of the GPT-2 padding token."""
        return self.gpt2_tokenizer.pad_token_id

    def GPT2_eos_token_id(self):
        """Return the id of the GPT-2 end-of-sequence token."""
        return self.gpt2_tokenizer.eos_token_id

    def GPT2_encode(self, sentences, pad=True, max_length=None):
        """Encode a caption, or a batch of captions, into GPT-2 token ids.

        Args:
            sentences: Either a single ``str`` or an indexable batch whose
                rows hold the caption at position 0 (e.g. an ``(N, 1)``
                object array). Missing captions (NaN/None) are encoded as
                the empty string.
            pad: Pad every sequence to ``max_length`` when true.
            max_length: Target length; defaults to ``max_caption_length``.

        Returns:
            A list of ids for ``str`` input, otherwise an integer array of
            shape ``(len(sentences), max_length)``.

        The batch path lower-cases each caption, removes double quotes and
        ``xxxx`` placeholders, and maps ``endseq`` to ``<|endoftext|>``.
        The caller's ``sentences`` object is never modified.
        """
        if max_length is None:
            max_length = self.max_caption_length
        # `padding`/`truncation` replace the deprecated `pad_to_max_length`
        # flag and make the truncation to `max_length` explicit.
        encode_kwargs = {
            "add_special_tokens": True,
            "max_length": max_length,
            "padding": "max_length" if pad else False,
            "truncation": True,
        }
        if isinstance(sentences, str):
            return self.gpt2_tokenizer.encode(sentences, **encode_kwargs)

        tokens = np.zeros((len(sentences), max_length), dtype=int)
        for i, row in enumerate(sentences):
            raw = row[0]
            sentence = "" if pd.isna(raw) else raw
            sentence = sentence.lower()
            sentence = sentence.replace('"', '')
            sentence = sentence.replace('xxxx', '')
            sentence = sentence.replace('endseq', '<|endoftext|>')
            tokens[i] = self.gpt2_tokenizer.encode(sentence, **encode_kwargs)
        return tokens

    def GPT2_decode(self, tokens):
        """Decode GPT-2 token ids to text, dropping special tokens."""
        return self.gpt2_tokenizer.decode(tokens, skip_special_tokens=True)

    def GPT2_format_output(self, sentence):
        """Return ``sentence`` as a cleaned list of words."""
        return self.clean_sentence(sentence)

    def filter_special_words(self, sentence):
        """Strip sequence markers and the pad character from ``sentence``."""
        for marker in ('startseq', 'endseq', '<|endoftext|>', '<'):
            sentence = sentence.replace(marker, '')
        return sentence.strip()

    def init_tokenizer(self, sentences):
        """Fit the Keras word-level tokenizer on ``sentences``.

        Missing entries (NaN/None) are treated as empty captions. The input
        sequence is left untouched; a cleaned copy is used for fitting.
        """
        cleaned = [
            self.clean_sentence("" if pd.isna(sentence) else sentence)
            for sentence in sentences
        ]
        self.tokenizer = Tokenizer(oov_token='UNK', num_words=self.tokenizer_num_words)
        self.tokenizer.fit_on_texts(cleaned)  # give each word a unique id

    def get_tokenizer_num_words(self):
        """Return the configured ``num_words`` limit."""
        return self.tokenizer_num_words

    def get_token_of_word(self, word):
        """Return the id of ``word``; raises ``KeyError`` for unknown words."""
        return self._keras_tokenizer().word_index[word]

    def get_word_from_token(self, token):
        """Return the word for ``token``, or ``""`` when it cannot be resolved.

        Only lookup failures are swallowed (unknown id, unhashable id, or a
        tokenizer that has not been fitted yet).
        """
        try:
            return self._keras_tokenizer().index_word[token]
        except (AttributeError, KeyError, TypeError):
            return ""

    def get_sentence_from_tokens(self, tokens):
        """Convert the first row of ``tokens`` into a list of words.

        Stops at the first ``endseq`` and skips ``startseq``; unresolved ids
        contribute an empty string.
        """
        sentence = []
        for token in tokens[0]:
            word = self.get_word_from_token(token)
            if word == 'endseq':
                return sentence
            if word != 'startseq':
                sentence.append(word)

        return sentence

    def get_string_from_word_list(self, word_list):
        """Join ``word_list`` with single spaces."""
        return " ".join(word_list)

    def get_word_tokens_list(self):
        """Return the word -> id mapping of the fitted Keras tokenizer."""
        return self._keras_tokenizer().word_index

    def tokenize_sentences(self, sentences):
        """Encode a batch of captions with the Keras tokenizer.

        Each row of ``sentences`` holds the caption at position 0. Sequences
        are post-padded to ``max_caption_length``.

        Returns:
            Integer array of shape ``(len(sentences), max_caption_length)``.
        """
        tokenizer = self._keras_tokenizer()
        tokenized_sentences = np.zeros((len(sentences), self.max_caption_length), dtype=int)
        for index, caption in enumerate(sentences):
            tokenized_caption = tokenizer.texts_to_sequences([self.clean_sentence(caption[0])])
            tokenized_sentences[index] = pad_sequences(tokenized_caption, maxlen=self.max_caption_length,
                                                       padding='post')  # padded with max length
        return tokenized_sentences
|
utility.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import os
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import torch
|
| 5 |
+
import torch.nn as nn
|
| 6 |
+
from torch.utils.data import DataLoader
|
| 7 |
+
from generator import AugmentedImageSequence
|
| 8 |
+
import torch.optim as optim
|
| 9 |
+
from tensorflow.keras.models import model_from_json # type: ignore
|
| 10 |
+
|
| 11 |
+
def get_dataloader(csv, batch_size, FLAGS, tokenizer_wrapper, augmenter=None):
    """
    Build a PyTorch DataLoader on top of an AugmentedImageSequence
    (the PyTorch replacement for the former TensorFlow enqueuer).

    The label columns, image directory and target size are read from
    ``FLAGS``. ``batch_size`` is handed to the sequence itself; the
    DataLoader is created with its own default batch size, ``shuffle=True``
    and ``num_workers=0``.

    Returns a ``(dataloader, steps)`` tuple where ``steps`` is the
    sequence's ``steps`` attribute.
    """
    sequence_options = {
        "dataset_csv_file": csv,
        "class_names": FLAGS.csv_label_columns,
        "tokenizer_wrapper": tokenizer_wrapper,
        "source_image_dir": FLAGS.image_directory,
        "batch_size": batch_size,
        "target_size": FLAGS.image_target_size,
        "augmenter": augmenter,
        "shuffle_on_epoch_end": True,
    }
    sequence = AugmentedImageSequence(**sequence_options)
    loader = DataLoader(sequence, shuffle=True, num_workers=0)
    return loader, sequence.steps
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def _resolve_activation(name):
    """Return the ``torch.nn`` module class matching ``name`` case-insensitively.

    Falls back to ``nn.ReLU`` when ``name`` is empty or matches no
    ``nn.Module`` subclass, mirroring the previous default.
    """
    if not name:
        return nn.ReLU
    wanted = str(name).lower()
    for attr in dir(nn):
        candidate = getattr(nn, attr)
        if (
            attr.lower() == wanted
            and isinstance(candidate, type)
            and issubclass(candidate, nn.Module)
        ):
            return candidate
    return nn.ReLU


def get_layers(layer_sizes, activation='relu', in_features=None):
    """
    Builds a stack of PyTorch layers from a list of sizes.

    Values < 1 become ``nn.Dropout`` with that probability; any other value
    becomes an ``nn.Linear`` producing that many features, followed by the
    activation.

    Args:
        layer_sizes: Iterable of dropout rates (< 1) and layer widths (>= 1).
        activation: Name of a ``torch.nn`` activation class, matched
            case-insensitively ('relu', 'gelu', 'tanh', 'leakyrelu', ...).
            Unknown names fall back to ReLU.
        in_features: Input width of the first Linear layer. When omitted the
            first Linear layer is square (its own width in and out).

    Returns:
        ``nn.Sequential`` containing the layers in order.

    Each Linear layer after the first takes its input width from the
    previous Linear layer, so stacks with differing widths compose on a
    forward pass. Names such as 'gelu' or 'elu' previously resolved to ReLU
    because ``str.capitalize`` does not reproduce torch's class-name casing.
    """
    layers = []
    activation_fn = _resolve_activation(activation)
    previous_width = in_features
    for layer_size in layer_sizes:
        if layer_size < 1:
            layers.append(nn.Dropout(layer_size))
            continue
        input_width = layer_size if previous_width is None else previous_width
        layers.append(nn.Linear(in_features=input_width, out_features=layer_size))
        layers.append(activation_fn())
        previous_width = layer_size  # Dropout does not change the width
    return nn.Sequential(*layers)  # Return as a sequential module for easy stacking
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def get_optimizer(optimizer_type, learning_rate, lr_decay=0):
    """
    Instantiate the ``torch.optim`` optimizer class named ``optimizer_type``.

    The optimizer is created over a single empty placeholder parameter, so
    it does not reference any model weights when returned.
    ``learning_rate`` is passed as ``lr`` and ``lr_decay`` as
    ``weight_decay``.

    NOTE(review): ``lr_decay`` is forwarded as weight decay, which is
    presumably not the same thing as a learning-rate decay schedule --
    confirm this mapping is intended.
    """
    build = getattr(optim, optimizer_type)
    settings = {"lr": learning_rate, "weight_decay": lr_decay}
    placeholder = torch.nn.Parameter(torch.empty(0))
    return build([placeholder], **settings)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def load_model(load_path, model_name):
    """
    Rebuild a Keras model from ``<load_path>/<model_name>.json`` and load
    its weights from ``<load_path>/<model_name>.h5``.

    Args:
        load_path: Directory containing both files.
        model_name: File stem shared by the ``.json`` and ``.h5`` files.

    Returns:
        The model returned by ``model_from_json`` with weights loaded.

    Raises:
        OSError: If the JSON architecture file cannot be opened.
    """
    path = os.path.join(load_path, model_name)

    # load json and create model; the context manager closes the file even
    # if reading fails
    with open('{}.json'.format(path), 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights("{}.h5".format(path))
    print("Loaded model from disk")
    return loaded_model
|
utils.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from PIL import Image
|
| 2 |
+
import io
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pydicom
|
| 5 |
+
import torch
|
| 6 |
+
from fastapi import HTTPException, UploadFile
|
| 7 |
+
from skimage.transform import resize
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def load_image(image):
    """Turn an image into a normalised ``1 x 3 x 224 x 224`` float32 tensor.

    ``image`` must provide ``convert("RGB")`` (the sibling converter in this
    file returns a PIL image). Pixel values are scaled to [0, 1], the image
    is resized to 224 x 224, moved to channel-first layout and standardised
    per channel.
    """
    rgb_pixels = np.asarray(image.convert("RGB")) / 255.0  # scale to [0, 1]
    resized = resize(rgb_pixels, (224, 224))
    chw = torch.tensor(resized, dtype=torch.float32).permute(2, 0, 1)  # HxWxC -> CxHxW

    # Per-channel statistics, shaped (3, 1, 1) so they broadcast over H and W.
    # NOTE(review): presumably the ImageNet mean/std -- confirm they match
    # the preprocessing the model was trained with.
    channel_mean = torch.tensor([0.485, 0.456, 0.406]).reshape(3, 1, 1)
    channel_std = torch.tensor([0.229, 0.224, 0.225]).reshape(3, 1, 1)
    standardised = (chw - channel_mean) / channel_std

    return standardised.unsqueeze(0)  # prepend the batch dimension
|
| 21 |
+
|
| 22 |
+
async def convert_to_png(file: UploadFile) -> Image.Image:
    """Decode an uploaded JPEG, PNG or DICOM file into a PIL image.

    Despite the name, no PNG encoding takes place: JPEG/PNG uploads are
    returned as opened by PIL (in their native mode), DICOM uploads are
    rescaled to 8 bit and returned as RGB.

    Args:
        file: The uploaded file; its ``content_type`` and, for DICOM, its
            filename extension (``.dcm`` / ``.dicom``, any case) select the
            decoder.

    Returns:
        The decoded ``PIL.Image.Image``.

    Raises:
        HTTPException: 400 when the upload is neither JPEG/PNG nor DICOM.
    """
    image_data = await file.read()
    if file.content_type in ["image/jpeg", "image/png", "image/jpg"]:
        return Image.open(io.BytesIO(image_data))

    # The filename is optional on uploads and extensions may be upper-case.
    filename = (file.filename or "").lower()
    if file.content_type == "application/dicom" or filename.endswith((".dcm", ".dicom")):
        dicom_data = pydicom.dcmread(io.BytesIO(image_data))
        pixel_array = dicom_data.pixel_array

        if pixel_array.dtype != np.uint8:
            peak = pixel_array.max()
            if peak > 0:
                scaled = pixel_array / peak * 255
            else:
                # An all-zero (or non-positive) frame would divide by zero;
                # map it to a black image instead.
                scaled = np.zeros(pixel_array.shape)
            # Clip so negative samples cannot wrap around in the uint8 cast.
            pixel_array = np.clip(scaled, 0, 255).astype(np.uint8)

        return Image.fromarray(pixel_array).convert("RGB")

    raise HTTPException(status_code=400, detail="Unsupported media type")
|