NatureLM-Audio / data_store.py
gagannarula's picture
data studio got confused by original_file_name
71ceb2a verified
import os
from pathlib import Path
import uuid
import json
from huggingface_hub import HfApi, HfFileSystem
# Hub dataset repository that receives the uploaded audio and metadata.
DATASET_REPO = "EarthSpeciesProject/naturelm-audio-space-logs"
# Top-level folder inside the dataset repo that uploads are written to.
SPLIT = "test"
# True when the TESTING environment variable is set to "1".
TESTING = os.environ.get("TESTING", "0") == "1"

# Hub API client; authenticated with HF_TOKEN when that variable is set.
api = HfApi(token=os.environ.get("HF_TOKEN"))
# Filesystem-style view of the Hub, built with the same token lookup.
hf_fs = HfFileSystem(token=os.environ.get("HF_TOKEN"))
def upload_data(audio: str | Path, user_text: str, model_response: str):
    """Upload an audio file and log the matching exchange in the dataset repo.

    The audio is uploaded to ``{SPLIT}/audio/<id><suffix>`` in ``DATASET_REPO``
    and one JSON record describing the exchange is added to
    ``{SPLIT}/metadata.jsonl`` in the same repo.

    Args:
        audio: Local path of the audio file to upload.
        user_text: User message, stored under the ``user_message`` key.
        model_response: Model reply, stored under the ``model_response`` key.

    Note:
        The metadata file is read in full and rewritten on every call, so
        overlapping calls can overwrite each other's records.
    """
    data_id = str(uuid.uuid4())
    if TESTING:
        # Mark records created while TESTING is enabled.
        data_id = "test-" + data_id

    # Audio path in repo
    suffix = Path(audio).suffix
    audio_p = f"{SPLIT}/audio/" + data_id + suffix
    api.upload_file(
        path_or_fileobj=str(audio),
        path_in_repo=audio_p,
        repo_id=DATASET_REPO,
        repo_type="dataset",
    )

    record = {
        "user_message": user_text,
        "model_response": model_response,
        "file_name": "audio/" + data_id + suffix,  # has to be relative to metadata.jsonl
        "original_fn": os.path.basename(audio),
        "id": data_id,
    }

    metadata_path = f"datasets/{DATASET_REPO}/{SPLIT}/metadata.jsonl"

    # Opening in append mode does not work here, so the existing content is
    # read first and the whole file is written back with the new record.
    lines = []
    if hf_fs.exists(metadata_path):
        with hf_fs.open(metadata_path, "r") as f:
            lines = f.readlines()

    # Guard against a last line without a trailing newline; otherwise the new
    # record would be glued onto it and produce an invalid JSONL line.
    if lines and not lines[-1].endswith("\n"):
        lines[-1] += "\n"

    lines.append(json.dumps(record) + "\n")
    with hf_fs.open(metadata_path, "w") as f:
        f.writelines(lines)
# Alternative (disabled): write one JSON file per record instead of rewriting metadata.jsonl
# with hf_fs.open(f"datasets/{DATASET_REPO}/{data_id}.json", "w") as f:
# json.dump(text, f)