import json
import uuid
from datetime import datetime, timezone
from pathlib import Path
 
def _make_result(from_prefix, result_type, page_index, region_id, score, value):
    """Build one Label Studio result item bound to the page at *page_index*."""
    return {
        # Coordinates in ``value`` are percentages, so a nominal 100x100
        # canvas is reported as the original size.
        "original_width": 100,
        "original_height": 100,
        "image_rotation": 0,
        "value": value,
        "id": region_id,
        "from_name": f"{from_prefix}_{page_index}",
        "to_name": f"page_{page_index}",
        "type": result_type,
        "origin": "prediction",
        "score": score,
    }


def _entity_results(entity, page_index):
    """Return the bbox, transcription and labels result items for one DLS entity."""
    text = entity['text']
    # Entities without any label are tagged "ignore" so they still carry a label.
    labels = entity['labels'] or [{"label_name": "ignore"}]

    # Normalized vertices (0..1) are scaled to percentages (0..100).
    points = [
        [float(vertex['x']) * 100, float(vertex['y']) * 100]
        for vertex in entity['boundingPolygon']['normalizedVertices']
    ]

    # The three items of one region share a single id. Ten hex characters keep
    # accidental collisions between different regions of a task negligible
    # (four characters collide quickly on documents with many entities).
    region_id = uuid.uuid4().hex[:10]
    score = float(entity.get('confidence', 0.99))
    label_names = [label['label_name'] for label in labels]

    return [
        _make_result("bbox", "rectangle", page_index, region_id, score,
                     {"points": points, "closed": True}),
        _make_result("transcription", "textarea", page_index, region_id, score,
                     {"points": points, "closed": True, "text": [text]}),
        _make_result("labels", "labels", page_index, region_id, score,
                     {"points": points, "closed": True, "labels": label_names}),
    ]


def convert_to_label_studio_format(dls_jsonl_path, ls_json_path, ls_document_root, input_folder, output_images_folder, project_id=1):
    """
    Convert DLS (Data Labeling Service) JSONL annotations to a Label Studio JSON file.

    Every non-blank line of the input file is parsed as one DLS record and turned
    into one task. Only the first annotation of each record is converted. Each
    entity on each page produces three result items (bounding region,
    transcription, labels) that share one region id. The list of tasks is written
    to ``ls_json_path`` and a summary line is printed.

    Args:
        dls_jsonl_path (str): Path to the input DLS JSONL annotation file.
        ls_json_path (str): Path where the converted Label Studio JSON file will be saved.
        ls_document_root (str): Root directory that the generated
            ``/data/local-files/?d=`` references are made relative to.
        input_folder (str): Directory containing the input documents (e.g., PDFs, images).
            Must be located under ``ls_document_root``.
        output_images_folder (str): Directory holding the per-page images of PDF
            documents, laid out as ``<document stem>/page_<pageNumber>.png``.
            Must be located under ``ls_document_root``.
        project_id (int, optional): ID of the Label Studio project. Defaults to 1.

    Returns:
        None. The result is written to ``ls_json_path``.

    Raises:
        ValueError: If ``input_folder`` or ``output_images_folder`` is not located
            under ``ls_document_root``.
        KeyError, IndexError: If a record lacks the fields read by the conversion
            (e.g. ``sourceDetails.path`` or a first annotation).
        json.JSONDecodeError: If a non-blank input line is not valid JSON.

    Example:
        convert_to_label_studio_format(
            "annotations.dls.jsonl",
            "annotations.ls.json",
            "/datasets",
            "/datasets/input_pdfs",
            "/datasets/out_input_pdfs",
            project_id=2
        )
    """
    # Blank lines (typically a trailing newline) are skipped rather than parsed.
    with open(dls_jsonl_path, 'r', encoding='utf-8') as f:
        records = [json.loads(line) for line in f if line.strip()]

    user_id = 1

    # Folder references relative to ls_document_root, always with forward
    # slashes because they are embedded in URLs.
    input_folder_name = Path(input_folder).relative_to(ls_document_root).as_posix()
    output_images_folder_name = Path(output_images_folder).relative_to(ls_document_root).as_posix()

    tasks = []
    # Task ids and annotation ids both start at 1 and advance together.
    for task_id, record in enumerate(records, start=1):
        annotation_id = task_id

        doc_path = record['sourceDetails']['path']
        doc_name = Path(doc_path).stem
        doc_ext = Path(doc_path).suffix.lower()

        annotation = record['annotations'][0]
        pages_list = annotation.get('pages', [])

        document_url = f"/data/local-files/?d={input_folder_name}/{doc_path}"
        if doc_ext == '.pdf':
            # PDFs are displayed through one pre-rendered image per page.
            pages = [
                {"page": f"/data/local-files/?d={output_images_folder_name}/{doc_name}/page_{page['pageNumber']}.png"}
                for page in pages_list
            ]
        else:
            # Any other document is treated as a single image that is its own page.
            pages = [{"page": document_url}]

        results = []
        for page_index, page in enumerate(pages_list):
            for entity in page.get('entities', []):
                results.extend(_entity_results(entity, page_index))

        current_time = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')
        task = {
            "id": task_id,
            "annotations": [{
                "id": annotation_id,
                "completed_by": user_id,
                "result": results,
                "was_cancelled": False,
                "ground_truth": False,
                "created_at": current_time,
                "updated_at": current_time,
                "draft_created_at": current_time,
                "lead_time": 10.0,
                "prediction": {},
                "result_count": len(results),
                "unique_id": str(uuid.uuid4()),
                "import_id": None,
                "last_action": None,
                "bulk_created": False,
                "task": task_id,
                "project": project_id,
                "updated_by": user_id,
                "parent_prediction": None,
                "parent_annotation": None,
                "last_created_by": None
            }],
            "file_upload": "converted_task.json",
            "drafts": [],
            "predictions": [],
            "data": {
                "document": document_url,
                "pages": pages,
                "ls_document_root": ls_document_root
            },
            "meta": {},
            "created_at": current_time,
            "updated_at": current_time,
            # NOTE(review): inner_id is the same for every task; confirm whether
            # it should follow task_id instead.
            "inner_id": 1,
            "total_annotations": 1,
            "cancelled_annotations": 0,
            "total_predictions": 0,
            "comment_count": 0,
            "unresolved_comment_count": 0,
            "last_comment_updated_at": None,
            "project": project_id,
            "updated_by": user_id,
            "comment_authors": []
        }
        tasks.append(task)

    with open(ls_json_path, 'w', encoding='utf-8') as f_out:
        json.dump(tasks, f_out, indent=4)

    print(f"Converted {len(tasks)} records to Label Studio import format at {ls_json_path}")
 
## Test example
# Guarded so that importing this module does not run a conversion against
# machine-specific paths; the example only runs when the file is executed directly.
if __name__ == "__main__":
    dls_jsonl_path = r"/home/raraushk/LS_integration/label_studio/records_1750296882615.jsonl"
    ls_json_path = r"annotations_ls.json"
    input_folder = r"/home/raraushk/LS_integration/label_studio/datasets/input_pdfs"
    output_images_folder = r"/home/raraushk/LS_integration/label_studio/datasets/output_images"
    ls_document_root = r"/home/raraushk/LS_integration/label_studio/datasets"
    convert_to_label_studio_format(dls_jsonl_path, ls_json_path, ls_document_root, input_folder, output_images_folder)