Rishi Desai committed · Commit a446ad0 · 1 Parent(s): c9dac35

using together ai now
caption.py  +34 -13

caption.py CHANGED

@@ -1,7 +1,7 @@
 import base64
 import io
 import os
-from
+from together import Together
 from PIL import Image
 from dotenv import load_dotenv
 
@@ -41,7 +41,7 @@ Captioning Principles:
 - Avoid mentioning real or fictional identities.
 - Always prefix with the trigger word "tr1gger."
 
-
+Examples:
 - tr1gger photorealistic, tactical vest and gloves, standing in profile, neutral expression, overcast lighting, side profile
 - tr1gger 3D-rendered, hooded cloak with digital pattern, seated cross-legged, calm expression, low ambient lighting, front view
 - tr1gger anime-style, school uniform with blue necktie, standing with arms behind back, gentle smile, soft daylight, three-quarter view
@@ -57,7 +57,13 @@ def caption_images(images):
         img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
         image_strings.append(img_str)
 
-
+    # Retrieve the API key from the environment
+    api_key = os.environ.get("TOGETHER_API_KEY")
+    if not api_key:
+        raise ValueError("TOGETHER_API_KEY is not set in the environment.")
+
+    # Pass the API key to the Together client
+    client = Together(api_key=api_key)
     captions = []
 
     # Start a separate chat session for each image
@@ -65,31 +71,40 @@ def caption_images(images):
         messages = [
            {"role": "system", "content": get_prompt()},
            {
-                "role": "user",
+                "role": "user",
                 "content": [
-                    {"type": "
-                    {"type": "
+                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_str}"}},
+                    {"type": "text", "text": "Describe this image."}
                 ]
             }
         ]
 
-        # Request caption for the image
+        # Request caption for the image using Llama 4 Maverick
         response = client.chat.completions.create(
-            model="
-            messages=messages
-            max_tokens=512 # Adjust max_tokens as needed
         )
 
         # Extract caption from the response
-
+        full_response = response.choices[0].message.content.strip()
+        # Post-process to extract only the caption part
+        caption = next((line for line in full_response.splitlines() if line.startswith("tr1gger")), "")
         captions.append(caption)
 
     return captions
 
+def extract_captions(file_path):
+    captions = []
+    with open(file_path, 'r') as file:
+        for line in file:
+            if line.startswith("tr1gger"):
+                captions.append(line.strip())
+    return captions
+
 # Example usage
 if __name__ == "__main__":
-    if not os.environ.get("
-        print("Please update the
+    if not os.environ.get("TOGETHER_API_KEY"):
+        print("Please update the environment with your Together AI API key.")
         exit(1)
 
     # Load images
@@ -100,3 +115,9 @@ if __name__ == "__main__":
     captions = caption_images(images)
     for i, caption in enumerate(captions):
         print(f"Generated Caption for Image {i+1}: {caption}")
+
+    # Extract captions from a file
+    file_path = 'post_girl/multiview_0.txt'
+    extracted_captions = extract_captions(file_path)
+    for caption in extracted_captions:
+        print(caption)
|