"""
Generates the AqSolDB visual graph dataset from a CSV base file. This dataset consists of roughly 10_000
molecular graphs, which are annotated with measured values of water solubility as target values.

CHANGELOG

0.1.0 - 29.01.2023 - Initial version

0.2.0 - 24.03.2023 - Changed the dataset generation to now include .meta.yml dataset metadata
file and process.py standalone pre-processing module in the dataset folder as well.
"""
import os
import pathlib
import typing as t

from pycomex.util import Skippable
from pycomex.experiment import SubExperiment

# == DATASET PARAMETERS ==
# These module-level values reach the base experiment through the ``globals()`` argument of the
# SubExperiment created further down in this file.

# NOTE(review): presumably selects the file share from which CSV_FILE_NAME is obtained - confirm
# against the base experiment.
FILE_SHARE_PROVIDER: str = 'main'
# Path of the CSV file that serves as the source of the dataset.
CSV_FILE_NAME: str = 'source/aqsoldb.csv'
# No index column is configured for this CSV file.
INDEX_COLUMN_NAME: t.Optional[str] = None
# Name of the CSV column that holds the SMILES string of each molecule.
SMILES_COLUMN_NAME: str = 'SMILES'
# Names of the CSV columns that hold the target values - here only the solubility.
TARGET_COLUMN_NAMES: t.List[str] = ['Solubility']
# For this dataset we actually have a canonical train-test split from the literature. The integer
# key identifies the split and the value is the name of the CSV column that encodes it.
SPLIT_COLUMN_NAMES: t.Dict[int, str] = {0: 'split'}

# == DATASET METADATA ==
# Name of the generated dataset and the descriptive metadata that accompanies it. According to the
# module changelog (0.2.0), the generation includes a .meta.yml metadata file in the dataset folder.
DATASET_NAME: str = 'aqsoldb'
DATASET_META: t.Optional[dict] = {
    'version': '0.2.0',
    # Mirrors the CHANGELOG section of the module docstring.
    'changelog': [
        '0.1.0 - 29.01.2023 - initial version',
        (
            '0.2.0 - 24.03.2023 - Changed the dataset generation to now include '
            '.meta.yml dataset metadata file and process.py standalone '
            'pre-processing module in the dataset folder as well.'
        ),
    ],
    'description': (
        'Dataset consisting of roughly 10_000 molecular graphs annotated with '
        'measured values of their corresponding solubility (logS) value in water.'
    ),
    'references': [
        'Library used for the processing and visualization of molecules. https://www.rdkit.org/',
    ],
    'visualization_description': (
        'Molecular graphs generated by RDKit based on the SMILES representation of the molecule.'
    ),
    # Keyed by the index of the target value; there is a single target for this dataset.
    'target_descriptions': {
        0: 'measured logS values of the molecules solubility in Water. (unmodified)',
    },
}

# == EXPERIMENT PARAMETERS ==
# Absolute path of the folder that contains this module.
PATH = pathlib.Path(__file__).parent.absolute()
# The base experiment script is looked up in the same folder as this module. The value is kept as
# a plain string rather than a Path object.
EXPERIMENT_PATH = str(PATH / 'generate_molecule_dataset_from_csv.py')
# Working directory of the process at the time this module is executed.
BASE_PATH = os.getcwd()
NAMESPACE = 'results/generate_molecule_dataset_from_csv_aqsoldb'
DEBUG = True
with Skippable(), (se := SubExperiment(EXPERIMENT_PATH, BASE_PATH, NAMESPACE, globals())):

    # ~ Adding filters to the dataset processing step
    # The filters registered through the hook below are meant to reproduce the pre-processing steps
    # described in the original paper that introduced this dataset.

    def is_charged(mol, data):
        """
        Filter predicate which flags elements whose SMILES string contains a charged atom.

        In SMILES a formal charge can only be written inside a bracket atom (``[NH4+]``, ``[O-]``),
        whereas a ``-`` outside of brackets is the explicit single-bond symbol (``C-C``). Checking
        for ``'-'`` anywhere in the string would therefore flag neutral molecules that merely use
        explicit single bonds, so only the text inside brackets is inspected.

        :param mol: First argument supplied to the filter callback (presumably the parsed
            molecule - confirm against the base experiment). Not used by this filter.
        :param data: Mapping that describes the element. Only the ``'smiles'`` entry is read.

        :returns: True if a ``+`` or ``-`` sign occurs inside any bracket atom, False otherwise.
        """
        inside_bracket = False
        for char in data['smiles']:
            if char == '[':
                inside_bracket = True
            elif char == ']':
                inside_bracket = False
            elif inside_bracket and char in '+-':
                # Within a bracket atom these two characters can only denote a formal charge.
                return True

        return False

    def is_adjoined_mixture(mol, data):
        """
        Filter predicate which flags elements whose SMILES string contains a ``.`` character, which
        is the SMILES notation for separate, non-bonded components.

        :param mol: First argument supplied to the filter callback. Not used by this filter.
        :param data: Mapping that describes the element. Only the ``'smiles'`` entry is read.

        :returns: True if the SMILES string contains at least one ``.``, False otherwise.
        """
        # Splitting at the dot yields more than one piece exactly when a dot is present.
        components = data['smiles'].split('.')
        return len(components) > 1

    def no_carbon(mol, data):
        """
        Filter predicate which flags elements whose SMILES string does not contain a carbon atom.

        Looking for the letter "C" alone is wrong in both directions: aromatic carbon is written in
        lowercase (``c1ccccc1``) and would be reported as carbon-free, while symbols such as ``Cl``
        or ``[Ca+2]`` contain the letter without being carbon. The string is therefore inspected at
        the level of element symbols.

        :param mol: First argument supplied to the filter callback. Not used by this filter.
        :param data: Mapping that describes the element. Only the ``'smiles'`` entry is read.

        :returns: True if neither an aliphatic (``C``) nor an aromatic (``c``) carbon atom is
            found in the SMILES string, False otherwise.
        """
        # Imported locally so that this function does not depend on additional module-level imports.
        import re

        smiles = data['smiles']

        # A bracket atom has the form "[<isotope><symbol>...]", which means that the element symbol
        # is the first run of letters following the optional isotope digits.
        bracket_symbols = re.findall(r'\[\d*([A-Za-z][a-z]?)', smiles)
        if any(symbol in ('C', 'c') for symbol in bracket_symbols):
            return False

        # Outside of brackets only the organic subset may appear, where chlorine ("Cl") is the only
        # symbol that shares a letter with carbon.
        unbracketed = re.sub(r'\[[^\]]*\]', '', smiles)
        has_carbon = 'c' in unbracketed or re.search(r'C(?!l)', unbracketed) is not None
        return not has_carbon

    @se.hook('modify_filter_callbacks')
    def add_filters(e, filter_callbacks: t.List[t.Callable]):
        """
        Hook implementation for ``modify_filter_callbacks`` which registers the dataset specific
        filters ``is_charged`` and ``is_adjoined_mixture``.

        NOTE(review): ``no_carbon`` is defined next to these filters but is not registered here -
        confirm whether that omission is intentional.

        :param e: First argument supplied to the hook (presumably the experiment instance -
            confirm against the base experiment). Not used.
        :param filter_callbacks: The list of filter callables. It is extended in place.

        :returns: The same list object that was passed in, now including the additional filters.
        """
        filter_callbacks.extend((is_charged, is_adjoined_mixture))
        return filter_callbacks
