"""
Export medicines + dosage_cache as a single portable CSV.

Run this AFTER steps 1-3 (import + regex cache + Gemini cache) are done.
The output file can be copied to another environment and imported with
import_enriched.py — no Gemini calls needed on the target machine.

Usage:
    python scripts/export_enriched.py
    python scripts/export_enriched.py --output enriched_drugs.csv.gz
    python scripts/export_enriched.py --format xlsx --output enriched_drugs.xlsx
"""

import sys
import argparse
import time
from pathlib import Path
from sqlalchemy import text

sys.path.insert(0, str(Path(__file__).parent.parent))

from src.database.connection import engine

# All medicine columns we care about (matches Medicine model)
MEDICINE_COLS = [
    'product_id', 'product_name', 'marketer', 'salt_composition',
    'medicine_type', 'introduction', 'description', 'how_to_use',
    'safety_advise', 'if_miss', 'packaging_detail', 'package', 'qty',
    'product_form', 'mrp', 'prescription_req', 'fact_box', 'primary_use',
    'storage', 'use_of', 'common_side_effect',
    'alcohol_interaction', 'pregnancy_interaction', 'lactation_interaction',
    'driving_interaction', 'kidney_interaction', 'liver_interaction',
    'manufacturer_address', 'country_of_origin', 'q_a', 'how_it_works',
    'interaction', 'manufacturer_details', 'marketer_details',
    'expirtation', 'reference', 'source_file',
]

# Dosage cache columns (prefixed with dc_ to avoid collisions)
DOSAGE_COLS = [
    'dc_quantity', 'dc_dosage_code', 'dc_frequency',
    'dc_meal_preference', 'dc_duration',
    'dc_dose_morning', 'dc_dose_afternoon', 'dc_dose_night',
    'dc_age_group_dosages',
    'dc_generation_method', 'dc_confidence_score',
]


def main():
    parser = argparse.ArgumentParser(description="Export medicines + dosage cache to portable file")
    parser.add_argument('--output', type=str, default=None,
                        help="Output file path (default: scripts/enriched_drugs.csv.gz)")
    parser.add_argument('--format', choices=['csv', 'xlsx'], default='csv',
                        help="Output format (default: csv, gzip-compressed)")
    args = parser.parse_args()

    # Default output path
    if args.output:
        out_path = Path(args.output)
    else:
        ext = '.csv.gz' if args.format == 'csv' else '.xlsx'
        out_path = Path(__file__).parent / f'enriched_drugs{ext}'

    print("=" * 60)
    print("EXPORT ENRICHED MEDICINES + DOSAGE CACHE")
    print("=" * 60)

    # Build SELECT with LEFT JOIN so we get all medicines even without cache
    med_select = ', '.join(f'm.{c}' for c in MEDICINE_COLS)
    dc_select = ', '.join([
        'dc.quantity AS dc_quantity',
        'dc.dosage_code AS dc_dosage_code',
        'dc.frequency AS dc_frequency',
        'dc.meal_preference AS dc_meal_preference',
        'dc.duration AS dc_duration',
        'dc.dose_morning AS dc_dose_morning',
        'dc.dose_afternoon AS dc_dose_afternoon',
        'dc.dose_night AS dc_dose_night',
        'dc.age_group_dosages AS dc_age_group_dosages',
        'dc.generation_method AS dc_generation_method',
        'dc.confidence_score AS dc_confidence_score',
    ])

    query = f"""
        SELECT {med_select}, {dc_select}
        FROM medicines m
        LEFT JOIN dosage_cache dc ON m.id = dc.medicine_id
        ORDER BY m.id
    """

    print("Querying database...")
    start = time.time()

    import pandas as pd
    df = pd.read_sql(query, engine)
    query_time = time.time() - start

    print(f"Fetched {len(df):,} rows in {query_time:.1f}s")

    # Count how many have dosage cache
    cached = df['dc_dosage_code'].notna().sum()
    print(f"  With dosage cache: {cached:,}")
    print(f"  Without dosage cache: {len(df) - cached:,}")

    # Write output
    print(f"Writing to {out_path}...")
    start = time.time()

    if args.format == 'xlsx':
        df.to_excel(str(out_path), index=False, engine='openpyxl')
    else:
        df.to_csv(str(out_path), index=False, compression='gzip')

    write_time = time.time() - start
    size_mb = out_path.stat().st_size / (1024 * 1024)

    print(f"Written in {write_time:.1f}s ({size_mb:.1f} MB)")

    print("\n" + "=" * 60)
    print("EXPORT COMPLETE")
    print("=" * 60)
    print(f"  File: {out_path}")
    print(f"  Rows: {len(df):,}")
    print(f"  Size: {size_mb:.1f} MB")
    print(f"  Dosage cached: {cached:,} / {len(df):,}")
    print("=" * 60)
    print(f"\nTo import on another environment:")
    print(f"  python scripts/import_enriched.py --input {out_path.name}")


if __name__ == "__main__":
    main()
