# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
RLVE-Gym Environment Implementation.
"""

import os
from typing import Optional, Tuple
import random

from openenv_core.env_server.interfaces import Environment

from models import RlveGymState, RlveGymAction, RlveGymObservation
from server.Gym.environment import VerifiableEnvironment
from server.Gym.parameter_controller import ParameterController
from server.Gym.environments import identifier2environment
from server.Gym.parameter_controllers import identifier2controller


class RlveGymEnvironment(Environment):
    """
    Wrap any verifiable environment from RLVE-Gym behind the OpenEnv ``Environment`` API.
    """

    def __init__(
        self,
        environment_identifier: str = None,
        difficulty: int = None,
        answer_markers: Optional[Tuple[str, str]] = None,
        initial_seed: int = None,
    ):
        """
        Initialize the RLVE_Gym environment.

        Args:
            environment_identifier (str): The environment's identifier. Check server/Gym/environments/__init__.py for detailed usage.
            difficulty (int): The difficulty of generated problems.
            answer_markers (Tuple[str] of length 2): How the environment extracts the final answer from a model output.
            initial_seed (int): The initial seed to use when generating the first problem. Whenever reset() is called, the seed will be incremented by 1.
        """

        if environment_identifier is not None :
            self.environment_identifier = environment_identifier
        else :
            self.environment_identifier = os.getenv("RLVEGYM_ENVIRONMENT_IDENTIFIER", default = "Multiplication")
        
        if difficulty is not None :
            self.difficulty = difficulty
        else :
            self.difficulty = int(os.getenv("RLVEGYM_DIFFICULTY", default = "0"))
        
        if answer_markers is not None :
            self.answer_markers = answer_markers
        else :
            self.answer_markers = (os.getenv("RLVEGYM_ANSWER_MARKER_START", default = r"<answer>"), os.getenv("RLVEGYM_ANSWER_MARKER_END", default = r"</answer>"))
        
        if initial_seed is not None :
            pass
        else :
            initial_seed = int(os.getenv("RLVEGYM_INITIAL_SEED", default = "0"))

        self._state = RlveGymState(
            seed=initial_seed,
            problem_input=None,
            num_samples=0,
            sum_accuracy=0,
        )

        self.problem = None

    def reset(self) -> RlveGymObservation:
        """
        Reset the environment.

        Returns:
            problem_input (Optional[str]): The input of the problem; if it is None, it means that the problem generation has not been run or has failed.
            verifier_result (Optional[dict]): Contains reward as the raw reward, accuracy as the 0/1 correctness, and format_score as the 0/1 format correctness; if it is None, it means that the verification has failed.
            success (bool): True or False indicates whether the operation succeeded.
            message (str): The explanation of success.
            reward (Optional[float]): The value is verifier_result["reward"] when verifier_result is not None (otherwise, reward is also None).
        """
        if (self.environment_identifier not in identifier2environment) or (
            self.environment_identifier not in identifier2controller
        ):
            return RlveGymObservation(
                problem_input=None,
                verifier_result=None,
                success=False,
                message="Invalid environment identifier.",
                reward=None,
            )
        if not (isinstance(self.difficulty, int) and self.difficulty >= 0):
            return RlveGymObservation(
                problem_input=None,
                verifier_result=None,
                success=False,
                message="Difficulty should be a non-negative integer.",
                reward=None,
            )
        if not (isinstance(self._state.seed, int) and self._state.seed >= 0):
            return RlveGymObservation(
                problem_input=None,
                verifier_result=None,
                success=False,
                message="Seed should be a non-negative integer.",
                reward=None,
            )

        try:
            problem: VerifiableEnvironment = identifier2environment[self.environment_identifier](
                answer_markers=self.answer_markers
            )
        except Exception as e:
            return RlveGymObservation(
                problem_input=None,
                verifier_result=None,
                success=False,
                message=f"Failed to initialize environment: {e}",
                reward=None,
            )

        controller: ParameterController = identifier2controller[self.environment_identifier]()
        for _ in range(self.difficulty):
            controller.update()
        random.seed(self._state.seed)
        parameter = random.choice(controller.get_parameter_list())

        if problem.generator(seed=self._state.seed, parameter=parameter):
            self._state.problem_input = problem.prompt_generator()
            self.problem = problem
        else:
            self._state.problem_input = None
            self.problem = None

        self._state.seed += 1
        self._state.num_samples = self._state.sum_accuracy = 0

        if self.problem is not None:
            return RlveGymObservation(
                problem_input=self._state.problem_input,
                verifier_result=None,
                success=True,
                message="Problem generated successfully.",
                reward=None,
            )
        else:
            return RlveGymObservation(
                problem_input=None,
                verifier_result=None,
                success=False,
                message="Problem generation failed. Please try decreasing difficulty or changing seed.",
                reward=None,
            )

    def step(self, action: RlveGymAction) -> RlveGymObservation:  # type: ignore[override]
        """
        Execute a step in the environment by verifying the model output.

        Args:
            action (RlveGymAction): Contains a single field:
                - output (str): The model's output to get verified.

        Returns:
            problem_input (Optional[str]): The input of the problem; if it is None, it means that the problem generation has not been run or has failed.
            verifier_result (Optional[dict]): Contains reward as the raw reward, accuracy as the 0/1 correctness, and format_score as the 0/1 format correctness; if it is None, it means that the verification has failed.
            success (bool): True or False indicates whether the operation succeeded.
            message (str): The explanation of success.
            reward (Optional[float]): The value is verifier_result["reward"] when verifier_result is not None (otherwise, reward is also None).
        """
        if self.problem is None:
            return RlveGymObservation(
                problem_input=None,
                verifier_result=None,
                success=False,
                message="Problem not ready. Please reset the environment.",
                reward=None,
            )

        try:
            verifier_result = self.problem.verifier(action.output)
        except Exception as e:
            return RlveGymObservation(
                problem_input=self._state.problem_input,
                verifier_result=None,
                success=False,
                message=f"Verification failed with error: {e}",
                reward=None,
            )

        self._state.num_samples += 1
        self._state.sum_accuracy += verifier_result["accuracy"]

        return RlveGymObservation(
            problem_input=self._state.problem_input,
            verifier_result=verifier_result,
            success=True,
            message="Verification completed.",
            reward=verifier_result["reward"],
        )

    @property
    def state(self) -> RlveGymState:
        """
        Get the current environment state.

        Returns:
            seed (int): The seed to use when running reset().
            problem_input (Optional[str]): The input of the problem; if it is None, it means that the problem generation has not been run, or it failed.
            num_samples (int) and sum_accuracy (int): The statistics of the result of `step(action)` so far for the current problem (the number of outputs sent to the verifier and the number of correct ones).
        """
        return self._state