import re
import sys, argparse
from pathlib import Path
import os, yaml, json


def process_yaml_object(yaml_object):
    """Build the schema fragment describing the keys of a YAML mapping.

    Args:
        yaml_object: Mapping parsed from YAML (section name -> value).

    Returns:
        An empty dict for an empty mapping; otherwise a dict with a single
        ``"properties"`` entry that maps every section name to the schema
        produced by ``process_yaml_element`` for its value.
    """
    # An empty mapping contributes nothing: no "properties" key at all.
    if not yaml_object:
        return {}

    return {
        "properties": {
            section_name: process_yaml_element(section)
            for section_name, section in yaml_object.items()
        }
    }


def process_yaml_array(yaml_array):
    """Build the schema fragment describing the elements of a YAML sequence.

    Args:
        yaml_array: List parsed from YAML.

    Returns:
        An empty dict for an empty list; otherwise ``{"items": schema}`` where
        ``schema`` is the schema of the first element.

    Raises:
        Exception: If any later element yields a schema different from the
            first element's schema (heterogeneous arrays are not supported).
    """
    if not yaml_array:
        return {}

    # XXX Every element must produce exactly the same schema as the others.
    #     A more lenient rule (dict elements whose keys are a subset of the
    #     "biggest" element's keys) used to be sketched here but is
    #     deliberately disabled.
    first_element, *other_elements = yaml_array
    items_schema = process_yaml_element(first_element)

    for array_el in other_elements:
        if process_yaml_element(array_el) != items_schema:
            raise Exception(
                "You are trying to parse an heterogenous array, this is not supported. Check if you have strings starting with numbers! Abort."
            )

    return {"items": items_schema}


def type_reduction(el):
    """Reduce a parsed YAML scalar to the coarse Python type it stands for.

    Reduction rules, applied in order:

    * empty string                      -> ``type(None)``
    * "true"/"false" (any case)         -> ``bool``
    * string starting with a digit, or
      with "." followed by a digit      -> ``float``
    * real ``bool`` value               -> ``bool``
    * ``int`` value                     -> ``float``
    * anything else                     -> ``type(el)``

    Args:
        el: Any value produced by the YAML loader.

    Returns:
        The reduced type object.
    """
    if isinstance(el, str):
        if not el:
            return type(None)
        if el.lower() in ("true", "false"):
            return bool
        # Slice instead of indexing: for the one-character string "." there is
        # no second character, and el[1] would raise IndexError.
        if el[0].isdigit() or (el[0] == "." and el[1:2].isdigit()):
            return float
    # bool is a subclass of int, so it must be excluded here; otherwise
    # True/False would be reduced to float while "true"/"false" reduce to bool.
    if isinstance(el, int) and not isinstance(el, bool):
        return float
    return type(el)


# NUM_REGEX detects any "number-ish" string, i.e. a string that:
# - starts with a digit (numbers, possibly followed by a "measurement unit"):
#   012Gb
# - starts with a dot followed by a digit (decimal numbers with an implicit
#   leading 0, possibly followed by a "measurement unit"): .012Gb
# - starts with an "x" (lower or upper case) followed by a digit (hex-like
#   numbers, possibly followed by a "measurement unit"): x012Gb / X012Gb
# Only the first one or two characters are constrained; whatever follows the
# first digit is accepted as-is.
NUM_REGEX = re.compile("^[\\.xX]?\\d.*$")
# BOOL_REGEX detects any boolean-ish string: exactly "true" or "false",
# regardless of case.
BOOL_REGEX = re.compile("^(?:true|false)$", re.IGNORECASE)


def process_yaml_element(yaml_element):
    """Return the JSON Schema fragment describing one parsed YAML value.

    YAML gives no way to declare a type, so the mapping is deliberately
    lenient (as opposed to a stricter approach based on XML files):

    * booleans and boolean-ish strings accept a boolean or a string;
    * numbers and number-ish strings accept an integer, a number or a string.
      In YAML a number written without a decimal separator is always an
      integer, so a float property whose default happens to be whole would
      otherwise be inconsistently typed; the "number" JSON Schema type
      accepts whole numbers as well;
    * the literal string "auto" accepts a boolean, integer, number or string;
    * any other string accepts only a string;
    * mappings and sequences are handled recursively;
    * null accepts null or a string.

    Args:
        yaml_element: Value produced by the YAML loader.

    Returns:
        A dict holding the schema fragment for ``yaml_element``.

    Raises:
        Exception: If the value is of a type not listed above.
    """

    def any_of(*type_names):
        # Alternatives are emitted in the order given.
        return {"anyOf": [{"type": type_name} for type_name in type_names]}

    is_string = isinstance(yaml_element, str)

    # HACK: bool must be tested before int, because bool is a subclass of int.
    if isinstance(yaml_element, bool) or (
        is_string and BOOL_REGEX.match(yaml_element)
    ):
        return any_of("boolean", "string")

    if isinstance(yaml_element, (int, float)) or (
        is_string and NUM_REGEX.match(yaml_element)
    ):
        return any_of("integer", "number", "string")

    if is_string:
        if yaml_element == "auto":
            return any_of("boolean", "integer", "number", "string")
        return {"type": "string"}

    if isinstance(yaml_element, dict):
        return {"type": "object"} | process_yaml_object(yaml_element)

    if isinstance(yaml_element, list):
        return {"type": "array"} | process_yaml_array(yaml_element)

    if yaml_element is None:
        return any_of("null", "string")

    raise Exception("Unexpected yaml type...?!")


# Keys placed at the root of every generated schema. process_yaml_file merges
# the generated schema on top of this dict (this dict is the left operand of
# "|"), so a key that also appears in the generated schema takes the generated
# value. "additionalProperties" is only set here, i.e. at the root level.
JSON_SCHEMA_ROOT = {
    "$schema": "http://json-schema.org/schema#",
    "additionalProperties": False,
}

# Keyword arguments passed to json.dump when the schema is written out:
# non-ASCII characters are written as-is and the output is indented by 2.
JSON_DUMP_DEFAULT_SETTINGS = {"ensure_ascii": False, "indent": 2}


def process_yaml_file(in_filename: str, add_root_filename: bool = False):
    """Load a YAML file and derive a JSON Schema describing its content.

    Args:
        in_filename: Path of the YAML file to read.
        add_root_filename: When true, wrap the derived schema in an object
            with a single property named after the file. The name is the
            base filename up to the first ".yaml", with "-"-separated parts
            joined in camel case (e.g. "my-config.yaml" -> "myConfig").

    Returns:
        A dict holding the JSON Schema: ``JSON_SCHEMA_ROOT`` merged with the
        schema derived from the file content.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # locale default; this is also the encoding the script uses for its output.
    with open(in_filename, encoding="utf-8") as in_f:
        input_yaml = yaml.safe_load(in_f)

    # The file is fully parsed at this point, so the schema is built after
    # the handle has been closed.
    content_schema = process_yaml_element(input_yaml)

    if not add_root_filename:
        return JSON_SCHEMA_ROOT | content_schema

    base_filename_split = os.path.basename(in_filename).split(".yaml")[0].split("-")
    # First part is kept verbatim, every following part is capitalized.
    capitalized_filename = base_filename_split[0] + "".join(
        x.capitalize() for x in base_filename_split[1:]
    )

    json_schema = {
        "$schema": "http://json-schema.org/schema#",
        "type": "object",
        # "required": [capitalized_filename],
        "properties": {
            capitalized_filename: content_schema,
        },
    }
    return JSON_SCHEMA_ROOT | json_schema


if __name__ == "__main__":
    # Command-line entry point: read a YAML file, write its JSON Schema.
    #
    # sys.version_info is a 5-element tuple, so it never compares equal to
    # (3, 9); "<" therefore rejects exactly the interpreters older than 3.9.
    if sys.version_info < (3, 9):
        sys.stdout.write("Sorry, requires at least Python 3.9\n")
        sys.exit(1)

    cli = argparse.ArgumentParser(
        description="Generate a JSON Schema from a YAML file",
    )
    cli.add_argument(
        "--input_file",
        "-i",
        type=str,
        required=True,
        help="Path to input YAML file",
    )
    cli.add_argument(
        "--output_file",
        "-o",
        type=str,
        help="Path to output JSON Schema file",
    )
    cli.add_argument(
        "--add_root_filename",
        action="store_true",
        default=False,
        help="Add capitalized filename as root node of the json schema obtained from the yaml file",
    )
    args = cli.parse_args()

    json_schema = process_yaml_file(args.input_file, args.add_root_filename)

    # Without an explicit output path, write next to the input file, with the
    # input's last suffix replaced by ".schema.json".
    output_file = args.output_file or Path(args.input_file).with_suffix(
        ".schema.json"
    )
    with open(output_file, "w", encoding="utf-8") as out_f:
        json.dump(json_schema, out_f, **JSON_DUMP_DEFAULT_SETTINGS)
