UTAustin-SwarmLab · adihebbalae · Feb 23, 2026 · Feb 23, 2026 · Feb 25, 2026 · Feb 26, 2026
diff --git a/.gitignore b/.gitignore
@@ -181,7 +181,6 @@ cython_debug/
 env
 *.pyc
 output/
-data/
 lm_cache
 .idea
 build
@@ -213,7 +212,7 @@ LLaVA/
 temp/
 InternVL/
 logs/
-data/
+/data/
 llava-video/
 Video-MME/
 VATEX/

diff --git a/inputs/MEVA b/inputs/MEVA
@@ -0,0 +1 @@
+/nas/mars/dataset/MEVA
diff --git a/meva/.gitignore b/meva/.gitignore
@@ -27,11 +27,18 @@ Thumbs.db
 # Data (local only, not committed)
 output/
 data/qa_pairs/
+data/entity_descriptions/
+data/gpt_logs/
 *.log
 
-# Allow these specific preprocessing files
+# Allow these specific data files needed by v10 pipeline
 !data/annotated_activity_slots.txt
 !data/slot_index.json
+!data/canonical_slots.json
+!data/geom_slot_index.json
+!data/person_database.json
+!data/person_database_yolo.json
+!data/mevid_supported_slots.json
 
 # Jupyter
 .ipynb_checkpoints/
@@ -41,6 +48,7 @@ data/qa_pairs/
 tmp/
 temp/
 *.tmp
+*.bak
 
 # Local configuration (copy from config_template.py)
 config.py
diff --git a/meva/1_run_all_slots.sh b/meva/1_run_all_slots.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+#
+# Step 1: run the v10 raw QA pipeline once for each slot named in LIST_FILE.
+#
+# Run from the meva/ directory: LIST_FILE is a relative path and the pipeline
+# is invoked as `python -m scripts.v10.run_pipeline`.
+#
+# Environment:
+#   RUN_FIRST_TEN  "true" (default) stops after 10 slots; any other value
+#                  processes every slot in LIST_FILE.
+#
+# Exit status: 0 when every attempted slot succeeded, 1 otherwise.
+
+# Deliberately no `set -e`: one failing slot must not abort the whole batch.
+set -uo pipefail
+
+LIST_FILE="data/slot_list_from_slot_index.txt"
+RUN_FIRST_TEN="${RUN_FIRST_TEN:-true}"
+
+# Exported for the Python pipeline; the value is kept exactly as before.
+export MEVA_OUTPUT_DIR="/nas/neurosymbolic/multi-cam-dataset/meva/data_all_slots/"
+# "%/" drops the trailing slash so derived paths do not contain "//".
+OUTPUT_ROOT="${MEVA_OUTPUT_DIR%/}"
+
+if [[ ! -r "$LIST_FILE" ]]; then
+  printf 'error: cannot read slot list %s (run from the meva/ directory)\n' "$LIST_FILE" >&2
+  exit 1
+fi
+
+# Create the raw QA directory under the exported root, which is where
+# 2_run_naturalize_all_slots.sh looks for the raw files.
+mkdir -p "${OUTPUT_ROOT}/qa_pairs/raw"
+
+TIMESTAMP="$(date +"%Y%m%d-%H%M%S")"
+LOG_DIR="${OUTPUT_ROOT}/logs"
+if ! mkdir -p "$LOG_DIR"; then
+  printf 'error: cannot create log directory %s\n' "$LOG_DIR" >&2
+  exit 1
+fi
+exec > >(tee -a "${LOG_DIR}/${TIMESTAMP}_1_run_all_slots.log") 2>&1
+
+count=0
+failed=0
+# The list is read on fd 3 so the Python process cannot consume it via stdin.
+# `|| [[ -n "$slot" ]]` keeps a last line that has no trailing newline.
+while IFS= read -r slot <&3 || [[ -n "$slot" ]]; do
+  if [[ -z "$slot" ]]; then
+    continue
+  fi
+  if [[ "$RUN_FIRST_TEN" == "true" && "$count" -ge 10 ]]; then
+    break
+  fi
+  if ! python -m scripts.v10.run_pipeline \
+    --slot "$slot" \
+    -v \
+    --seed 42; then
+    echo "FAILED slot: $slot"
+    failed=$((failed + 1))
+  fi
+  count=$((count + 1))
+done 3< "$LIST_FILE"
+
+echo "Attempted ${count} slot(s); ${failed} failed."
+if [[ "$failed" -gt 0 ]]; then
+  exit 1
+fi
diff --git a/meva/2_run_naturalize_all_slots.sh b/meva/2_run_naturalize_all_slots.sh
@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+#
+# Step 2: run scripts.v10.naturalize on the raw QA file of each slot named in
+# LIST_FILE. Slots whose raw file is missing are skipped and not counted.
+#
+# Run from the meva/ directory: LIST_FILE is a relative path and the tool is
+# invoked as `python -m scripts.v10.naturalize`.
+#
+# Environment:
+#   RUN_FIRST_TEN  "true" (default) stops after 10 attempted slots; any
+#                  other value processes every slot in LIST_FILE.
+#
+# Exit status: 0 when every attempted slot succeeded, 1 otherwise.
+
+# Deliberately no `set -e`: one failing slot must not abort the whole batch.
+set -uo pipefail
+
+LIST_FILE="data/slot_list_from_slot_index.txt"
+RUN_FIRST_TEN="${RUN_FIRST_TEN:-true}"
+MODEL="gpt-5.2"
+TEMPERATURE="0.3"
+VERBOSE="true"
+
+export MEVA_OUTPUT_DIR="/nas/neurosymbolic/multi-cam-dataset/meva/data_all_slots/"
+mkdir -p "$MEVA_OUTPUT_DIR/qa_pairs/raw"
+
+if [[ ! -r "$LIST_FILE" ]]; then
+  printf 'error: cannot read slot list %s (run from the meva/ directory)\n' "$LIST_FILE" >&2
+  exit 1
+fi
+
+TIMESTAMP="$(date +"%Y%m%d-%H%M%S")"
+# "%/" drops the trailing slash so the log path does not contain "//".
+LOG_DIR="${MEVA_OUTPUT_DIR%/}/logs"
+if ! mkdir -p "$LOG_DIR"; then
+  printf 'error: cannot create log directory %s\n' "$LOG_DIR" >&2
+  exit 1
+fi
+exec > >(tee -a "${LOG_DIR}/${TIMESTAMP}_2_run_naturalize_all_slots.log") 2>&1
+
+count=0
+failed=0
+# The list is read on fd 3 so the Python process cannot consume it via stdin.
+# `|| [[ -n "$slot" ]]` keeps a last line that has no trailing newline.
+while IFS= read -r slot <&3 || [[ -n "$slot" ]]; do
+  if [[ -z "$slot" ]]; then
+    continue
+  fi
+  if [[ "$RUN_FIRST_TEN" == "true" && "$count" -ge 10 ]]; then
+    break
+  fi
+
+  input_path="$MEVA_OUTPUT_DIR/qa_pairs/raw/${slot}.raw.json"
+  if [[ ! -f "$input_path" ]]; then
+    echo "Skipping (missing raw): $input_path"
+    continue
+  fi
+
+  # Build the argument list as an array so every value stays a single word.
+  args=(--input "$input_path" --model "$MODEL" --temperature "$TEMPERATURE" --yes)
+  if [[ "$VERBOSE" == "true" ]]; then
+    args+=("-v")
+  fi
+  if ! python -m scripts.v10.naturalize "${args[@]}"; then
+    echo "FAILED slot: $slot"
+    failed=$((failed + 1))
+  fi
+
+  count=$((count + 1))
+done 3< "$LIST_FILE"
+
+echo "Attempted ${count} slot(s); ${failed} failed."
+if [[ "$failed" -gt 0 ]]; then
+  exit 1
+fi
diff --git a/meva/3_convert_naturalized_to_standard.sh b/meva/3_convert_naturalized_to_standard.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+#
+# Step 3: run scripts/convert_naturalized_to_standard.py with INPUT_DIR as
+# --input-dir and OUTPUT_DIR as --output-dir. A timestamped log is written to
+# OUTPUT_DIR/logs.
+#
+# Environment (optional; the defaults are the previously hard-coded paths):
+#   MEVA_CONVERT_INPUT_DIR   overrides INPUT_DIR
+#   MEVA_CONVERT_OUTPUT_DIR  overrides OUTPUT_DIR
+#
+# Exit status: that of the converter, or 1 if setup fails.
+
+set -uo pipefail
+
+INPUT_DIR="${MEVA_CONVERT_INPUT_DIR:-/home/ss99569/code/multi-cam/Multi-Camera/datasets/multi-cam-dataset/meva/data_all_slots/qa_pairs/raw}"
+OUTPUT_DIR="${MEVA_CONVERT_OUTPUT_DIR:-/home/ss99569/code/multi-cam/Multi-Camera/datasets/multi-cam-dataset/meva/}"
+
+# Prefer the converter next to this script so the step works from any checkout;
+# fall back to the original absolute path when it is not found there.
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
+CONVERTER="${SCRIPT_DIR}/scripts/convert_naturalized_to_standard.py"
+if [[ ! -f "$CONVERTER" ]]; then
+  CONVERTER="/home/ss99569/code/multi-cam/Multi-Camera/meva/scripts/convert_naturalized_to_standard.py"
+fi
+
+if [[ ! -d "$INPUT_DIR" ]]; then
+  printf 'error: input directory not found: %s\n' "$INPUT_DIR" >&2
+  exit 1
+fi
+
+TIMESTAMP="$(date +"%Y%m%d-%H%M%S")"
+# "%/" drops the trailing slash so the log path does not contain "//".
+LOG_DIR="${OUTPUT_DIR%/}/logs"
+if ! mkdir -p "$LOG_DIR"; then
+  printf 'error: cannot create log directory %s\n' "$LOG_DIR" >&2
+  exit 1
+fi
+exec > >(tee -a "${LOG_DIR}/${TIMESTAMP}_3_convert_naturalized_to_standard.log") 2>&1
+
+python "$CONVERTER" \
+  --input-dir "$INPUT_DIR" \
+  --output-dir "$OUTPUT_DIR"
diff --git a/meva/QUICKSTART.md b/meva/QUICKSTART.md
@@ -0,0 +1,134 @@
+# MEVA QA Pipeline — Quick Start
+
+## Prerequisites
+
+- Access to `/nas/mars/dataset/MEVA/` (shared on the `/nas` machine and remote cluster)
+- Python 3.10+
+- `OPENAI_API_KEY` set in your environment (required for Step 2 — naturalization only)
+
+### Install dependencies
+```bash
+pip install pyyaml numpy opencv-python openai
+```
+Or use the shared venv:
+```bash
+source /home/ah66742/venv/bin/activate
+```
+
+---
+
+## Running the Pipeline
+
+All commands must be run from the **`meva/` directory** inside this repo:
+
+```bash
+cd /path/to/repo/meva
+export PYTHONPATH=$PYTHONPATH:$(pwd)
+export OPENAI_API_KEY=sk-...          # only needed for Step 2
+export MEVA_OUTPUT_DIR=~/data         # where QA JSON + logs are saved (default: ~/data)
+```
+
+### Step 1 — Raw QA generation (free, ~5 seconds per slot)
+
+```bash
+python3 -m scripts.v10.run_pipeline --slot "2018-03-15.15-00.school" -v --seed 42
+```
+
+Output: `$MEVA_OUTPUT_DIR/qa_pairs/2018-03-15.15-00.school/2018-03-15.15-00.school.final.raw.json`
+
+### Step 2 — Naturalize with GPT (costs tokens, ~$0.002/slot with gpt-4o-mini)
+
+```bash
+python3 -m scripts.v10.naturalize \
+  --input $MEVA_OUTPUT_DIR/qa_pairs/2018-03-15.15-00.school/2018-03-15.15-00.school.final.raw.json \
+  -v --yes
+```
+
+Output: `...2018-03-15.15-00.school.final.naturalized.json`
+
+Use `--preprocess-only` to run deterministic text cleanup without any GPT call (free):
+```bash
+python3 -m scripts.v10.naturalize --input <raw.json> --preprocess-only
+```
+
+### Step 3 — Export to multi-cam-dataset format
+
+```bash
+python3 -m scripts.v10.export_to_multicam_format --slot "2018-03-15.15-00.school"
+```
+
+Output: `/nas/neurosymbolic/multi-cam-dataset/meva/qa_pairs/2018-03-15.15-00.school.json`
+
+### All-in-one
+
+Pass a slot as argument (or edit `SLOT=` inside the script):
+
+```bash
+bash run.sh "2018-03-15.15-00.school"
+```
+
+---
+
+## What's in `meva/data/`?
+
+These files are checked into the repo and are automatically found by the scripts:
+
+| File | Contents |
+|------|----------|
+| `data/canonical_slots.json` | 929 annotated slots with camera lists |
+| `data/slot_index.json` | Clip-level annotation index |
+| `data/person_database_yolo.json` | MEVID person descriptions (YOLO+GPT) |
+| `data/person_database.json` | MEVID person descriptions (original) |
+| `data/mevid_supported_slots.json` | Slots with MEVID re-ID coverage |
+| `data/geom_slot_index.json` | Geom-file index for entity descriptions |
+
+---
+
+## Slot Name Format
+
+Slots follow the pattern: `YYYY-MM-DD.HH-MM.site`
+
+Example: `2018-03-15.15-00.school`
+
+Sites: `school`, `admin`, `bus`, `hospital`
+
+To list all available annotated slots:
+```bash
+python3 -m scripts.v10.run_pipeline --list-slots
+```
+
+---
+
+## Output Layout
+
+```
+$MEVA_OUTPUT_DIR/
+  qa_pairs/
+    2018-03-15.15-00.school/
+      2018-03-15.15-00.school.final.raw.json          ← Step 1 output
+      2018-03-15.15-00.school.final.naturalized.json  ← Step 2 output
+      validation_videos/                              ← optional render step
+  gpt_logs/
+    2018-03-15.15-00.school/
+      naturalize_gpt-4o-mini.json
+  entity_descriptions/
+    2018-03-15.15-00.school.json                      ← auto-generated on first run
+```
+
+---
+
+## Common Issues
+
+**`canonical slot and slot index is required`**  
+→ You ran `python -m meva.scripts.v10.run_pipeline` from the repo root. There is no `meva/__init__.py`, so that path doesn't work.  
+→ **Fix**: `cd meva/ && export PYTHONPATH=$PYTHONPATH:$(pwd)` first, then use `python3 -m scripts.v10.run_pipeline`.
+
+**`No events found for slot ...`**  
+→ Check the slot exists: `python3 -m scripts.v10.run_pipeline --list-slots | grep <slot>`  
+→ Slot format must be `YYYY-MM-DD.HH-MM.site` (no seconds).
+
+**`OPENAI_API_KEY not set`**  
+→ Only needed for Step 2. Steps 1 and 3 make no GPT calls, so they run without the key.
+
+**Output goes to wrong directory**  
+→ Set `export MEVA_OUTPUT_DIR=/your/home/data` before running.
-Original file line number
+Diff line change
@@ Expand Up / @@ -181,7 +181,6 @@ cython_debug/ @@
     env
     *.pyc
     output/
-    data/
     lm_cache
     .idea
     build
@@ Expand Down Expand Up / @@ -213,7 +212,7 @@ LLaVA/ @@
     temp/
     InternVL/
     logs/
-    data/
+    /data/
     llava-video/
     Video-MME/
     VATEX/
@@ Expand Down @@