Spaces:
Running
on
Zero
Running
on
Zero
| import os | |
| from pathlib import Path | |
| import uuid | |
| import json | |
| from huggingface_hub import HfApi, HfFileSystem | |
# Dataset repository on the Hugging Face Hub that receives the logged data.
DATASET_REPO = "EarthSpeciesProject/naturelm-audio-space-logs"
# Top-level folder inside the repository that uploads are written under.
SPLIT = "test"
# True only when the TESTING environment variable is exactly "1".
TESTING = os.environ.get("TESTING", "0") == "1"

# Both clients take their token from HF_TOKEN (None when the variable is unset).
# Client used to upload files to the repository.
api = HfApi(token=os.getenv("HF_TOKEN"))
# Filesystem-style client used to check for, read and write files in the repository.
hf_fs = HfFileSystem(token=os.getenv("HF_TOKEN"))
def upload_data(audio: str | Path, user_text: str, model_response: str) -> None:
    """Upload an audio file and append its conversation record to the dataset.

    The audio is uploaded to ``{SPLIT}/audio/<id><suffix>`` in ``DATASET_REPO``
    and a JSON record describing the exchange is appended to
    ``{SPLIT}/metadata.jsonl`` in the same repository.

    Args:
        audio: Local path of the audio file to upload.
        user_text: Stored under the ``user_message`` key of the record.
        model_response: Stored under the ``model_response`` key of the record.

    Note:
        The metadata update reads the whole file and writes it back. It is not
        atomic, so two overlapping calls can overwrite each other's record.
        Writing one JSON file per record would avoid that.
    """
    data_id = str(uuid.uuid4())
    if TESTING:
        # Mark records created during test runs so they can be told apart.
        data_id = "test-" + data_id

    # Build the stored file name once; it is used for both the upload path and
    # the metadata record.
    audio_name = data_id + Path(audio).suffix
    api.upload_file(
        path_or_fileobj=str(audio),
        path_in_repo=f"{SPLIT}/audio/{audio_name}",
        repo_id=DATASET_REPO,
        repo_type="dataset",
    )

    record = {
        "user_message": user_text,
        "model_response": model_response,
        "file_name": "audio/" + audio_name,  # has to be relative to metadata.jsonl
        "original_fn": os.path.basename(audio),
        "id": data_id,
    }

    metadata_path = f"datasets/{DATASET_REPO}/{SPLIT}/metadata.jsonl"

    # Opening in append mode does not work here, so the existing content is
    # read first and the whole file is rewritten with the new record added.
    lines = []
    if hf_fs.exists(metadata_path):
        with hf_fs.open(metadata_path, "r", encoding="utf-8") as f:
            lines = f.readlines()
        # Without a trailing newline the new record would be glued onto the
        # last existing line and corrupt the JSONL file.
        if lines and not lines[-1].endswith("\n"):
            lines[-1] += "\n"

    lines.append(json.dumps(record) + "\n")
    with hf_fs.open(metadata_path, "w", encoding="utf-8") as f:
        f.writelines(lines)