unit HyperparameterOptimizer;
{$MODE OBJFPC}{$H+}{$RANGECHECKS ON}

{
    Part of AdvancedChatAI.
    For GNU/Linux 64 bit version.
    Version: 1.
    Written in Free Pascal (https://freepascal.org/).
    Copyright (C) 2025-2026 Artyomov Alexander
    Used https://chat.deepseek.com/
    http://self-made-free.ru/
    aralni@mail.ru

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
}


interface

uses
  SysUtils, Classes, Transformer, MatrixOps, DataUtils, TrainerUnit, Math;

type
  // Search space and training budget consumed by the optimisation routines
  // in this unit. Every array lists the candidate values for one
  // hyperparameter.
  THyperparameterConfig = record
    LearningRates: TDoubleArray;      // candidate learning rates
    BatchSizes: TIntegerArray;        // candidate batch sizes
    NumHeadsOptions: TIntegerArray;   // candidate attention-head counts
    FFNDimOptions: TIntegerArray;     // candidate FFNDim values
    NumLayersOptions: TIntegerArray;  // candidate layer counts
    DropoutRates: TDoubleArray;       // candidate dropout rates
    WeightDecays: TDoubleArray;       // candidate weight-decay values
    MaxEpochs: Integer;               // epoch count passed to the trainer
    ValidationSplit: Double;          // split argument passed to SplitDataset
  end;

  // One concrete hyperparameter combination together with the validation
  // loss that was measured for it.
  TBestConfig = record
    LearningRate: Double;
    BatchSize: Integer;
    NumHeads: Integer;
    FFNDim: Integer;
    NumLayers: Integer;
    DropoutRate: Double;
    WeightDecay: Double;
    ValidationLoss: Double;  // value returned by EvaluateModel for this combination
  end;

// Returns a THyperparameterConfig filled with the unit's built-in candidate
// lists, MaxEpochs = 10 and ValidationSplit = 0.2.
function CreateDefaultHyperparameterConfig: THyperparameterConfig;
// Grid search: iterates over the combinations listed in Config, trains a
// temporary model for each one and returns the selected configuration.
function FindBestHyperparameters(const Dataset: TTrainingDataset; const Config: THyperparameterConfig): TBestConfig;
// Random search: performs up to NumTrials trials, each with values drawn at
// random from the lists in Config.
procedure RandomSearch(var Model: TTransformer; const Dataset: TTrainingDataset; const Config: THyperparameterConfig; NumTrials: Integer);
// Replaces Dataset with an array that holds only the examples whose Input and
// ExpectedOutput are both non-empty after trimming.
procedure FilterEmptyExamples(var Dataset: TTrainingDataset);
// Loads 'training_data.txt' and compares a fixed list of learning rates on a
// small subset of the data, printing the best one.
procedure SimpleHyperparameterTuning;

implementation

{
  Builds the default search space used by the optimisers in this unit.

  Returns a THyperparameterConfig whose candidate lists and scalar settings
  are the fixed values assigned below.
}
function CreateDefaultHyperparameterConfig: THyperparameterConfig;
begin
  // Scalar settings
  Result.MaxEpochs := 10;
  Result.ValidationSplit := 0.2;

  // Floating-point candidate lists
  Result.LearningRates := [0.001, 0.0005, 0.0001, 0.00005, 0.00001];
  Result.DropoutRates := [0.0, 0.1, 0.2, 0.3];
  Result.WeightDecays := [0.0, 0.0001, 0.00001];

  // Integer candidate lists
  Result.BatchSizes := [8, 16, 32, 64];
  Result.NumHeadsOptions := [2, 4, 8];
  Result.FFNDimOptions := [256, 512, 1024];
  Result.NumLayersOptions := [1, 2, 3, 4];
end;

{
  Exhaustive grid search over every combination listed in Config.

  Parameters:
    Dataset - training examples; a filtered copy is used, the argument itself
              is not modified.
    Config  - candidate values, epoch budget and validation split.

  Returns the combination with the lowest validation loss. If the dataset has
  no usable examples, or no combination could be trained successfully, the
  built-in defaults are returned with ValidationLoss = MaxDouble.

  Exceptions raised while training or evaluating one combination are reported
  with WriteLn and the search continues with the next combination.
}
function FindBestHyperparameters(const Dataset: TTrainingDataset; const Config: THyperparameterConfig): TBestConfig;
var
  i, j, k, l, m, n, o: Integer;
  tempModel: TTransformer;
  trainData, valData: TTrainingDataset;
  bestLoss, valLoss: Double;
  currentConfig: TTransformerConfig;
  filteredData: TTrainingDataset;
begin
  // Start from safe default values so the result is always well defined
  Result.LearningRate := 0.0001;
  Result.BatchSize := 16;
  Result.NumHeads := 4;
  Result.FFNDim := 512;
  Result.NumLayers := 2;
  Result.DropoutRate := 0.1;
  Result.WeightDecay := 0.0001;
  Result.ValidationLoss := MaxDouble;

  // Must be set before the first "valLoss < bestLoss" comparison; an
  // uninitialised value would make the selection arbitrary.
  bestLoss := MaxDouble;

  // Work on a filtered copy so that empty examples never reach the trainer
  filteredData := Copy(Dataset);
  FilterEmptyExamples(filteredData);

  if Length(filteredData) = 0 then
  begin
    WriteLn('ERROR: No valid examples in dataset after filtering!');
    Exit;
  end;

  // Split the FILTERED data only. Splitting the raw Dataset afterwards would
  // overwrite trainData/valData and bring the empty examples back.
  SplitDataset(filteredData, trainData, valData, Config.ValidationSplit);

  WriteLn('Starting hyperparameter optimization...');
  WriteLn('Total combinations: ', 
    Length(Config.LearningRates) * Length(Config.BatchSizes) * 
    Length(Config.NumHeadsOptions) * Length(Config.FFNDimOptions) *
    Length(Config.NumLayersOptions) * Length(Config.DropoutRates) *
    Length(Config.WeightDecays));

  // Grid search over all combinations
  for i := 0 to High(Config.LearningRates) do
  for j := 0 to High(Config.BatchSizes) do
  for k := 0 to High(Config.NumHeadsOptions) do
  for l := 0 to High(Config.FFNDimOptions) do
  for m := 0 to High(Config.NumLayersOptions) do
  for n := 0 to High(Config.DropoutRates) do
  for o := 0 to High(Config.WeightDecays) do
  begin
    // Skip invalid combinations (this also protects the "mod" below from a
    // zero divisor)
    if (Config.NumHeadsOptions[k] <= 0) or 
       (Config.FFNDimOptions[l] <= 0) or
       (Config.NumLayersOptions[m] <= 0) or
       (Config.BatchSizes[j] <= 0) then Continue;

    // FFNDim has to be divisible by the number of heads
    if Config.FFNDimOptions[l] mod Config.NumHeadsOptions[k] <> 0 then
      Continue;

    // Guard against excessively large models
    if (Config.NumLayersOptions[m] > 12) or (Config.NumHeadsOptions[k] > 16) or
       (Config.FFNDimOptions[l] > 2048) then
      Continue;

    // Build the model configuration for this combination
    with currentConfig do begin
      InputSize := 300;
      NumLayers := Config.NumLayersOptions[m];
      NumHeads := Config.NumHeadsOptions[k];
      FFNDim := Config.FFNDimOptions[l];
      MaxSeqLength := 100;
      DropoutRate := Config.DropoutRates[n];
      WeightDecay := Config.WeightDecays[o];
      GradientClipValue := 1.0;
      UseLayerNorm := True;
    end;

    try
      InitTransformer(tempModel, currentConfig);
      try
        // Train on the training part of the data
        TrainTransformerWithValidation(tempModel, trainData, valData, Config.LearningRates[i], Config.MaxEpochs, Config.BatchSizes[j]);

        // Evaluate once on the validation set
        valLoss := EvaluateModel(tempModel, valData);

        WriteLn('Config: lr=', Config.LearningRates[i]:0:6, 
                ', bs=', Config.BatchSizes[j],
                ', heads=', Config.NumHeadsOptions[k],
                ', ffn=', Config.FFNDimOptions[l],
                ', layers=', Config.NumLayersOptions[m],
                ', dropout=', Config.DropoutRates[n]:0:1,
                ', wd=', Config.WeightDecays[o]:0:6,
                ', loss=', valLoss:0:4);

        // Remember the best combination seen so far
        if valLoss < bestLoss then begin
          bestLoss := valLoss;
          Result.LearningRate := Config.LearningRates[i];
          Result.BatchSize := Config.BatchSizes[j];
          Result.NumHeads := Config.NumHeadsOptions[k];
          Result.FFNDim := Config.FFNDimOptions[l];
          Result.NumLayers := Config.NumLayersOptions[m];
          Result.DropoutRate := Config.DropoutRates[n];
          Result.WeightDecay := Config.WeightDecays[o];
          Result.ValidationLoss := valLoss;
        end;
      finally
        // Release the temporary model even when training/evaluation raised
        FreeTransformer(tempModel);
      end;
    except
      on E: Exception do
        WriteLn('Error with config: ', E.Message);
    end;
  end;
end;

{
  Random search: evaluates up to NumTrials randomly drawn hyperparameter
  combinations and prints the best one.

  Parameters:
    Model     - kept for interface compatibility; this procedure does not
                read or modify it.
    Dataset   - training examples; a filtered copy is used, the argument
                itself is not modified.
    Config    - candidate values, epoch budget and validation split.
    NumTrials - number of draws. A draw that yields an invalid combination is
                skipped but still counts as a trial.

  Exceptions raised while initialising, training or evaluating a trial are
  reported with WriteLn and the search continues with the next trial.
}
procedure RandomSearch(var Model: TTransformer; const Dataset: TTrainingDataset; const Config: THyperparameterConfig; NumTrials: Integer);
var
  i: Integer;
  bestLoss: Double;
  bestConfig: TBestConfig;
  haveBest: Boolean;
  filteredData, trainData, valData: TTrainingDataset;
  tempModel: TTransformer;
  currentConfig: TBestConfig;
  transformerConfig: TTransformerConfig;
begin
  // Random(0) cannot produce a valid index, so every list needs at least
  // one candidate value.
  if (Length(Config.LearningRates) = 0) or (Length(Config.BatchSizes) = 0) or
     (Length(Config.NumHeadsOptions) = 0) or (Length(Config.FFNDimOptions) = 0) or
     (Length(Config.NumLayersOptions) = 0) or (Length(Config.DropoutRates) = 0) or
     (Length(Config.WeightDecays) = 0) then
  begin
    WriteLn('ERROR: RandomSearch needs at least one value for every hyperparameter!');
    Exit;
  end;

  // Same pre-processing as the grid search: never train on empty examples
  filteredData := Copy(Dataset);
  FilterEmptyExamples(filteredData);

  if Length(filteredData) = 0 then
  begin
    WriteLn('ERROR: No valid examples in dataset after filtering!');
    Exit;
  end;

  bestLoss := MaxDouble;
  haveBest := False;
  SplitDataset(filteredData, trainData, valData, Config.ValidationSplit);

  for i := 1 to NumTrials do begin
    // Draw one value per hyperparameter
    currentConfig.LearningRate := Config.LearningRates[Random(Length(Config.LearningRates))];
    currentConfig.BatchSize := Config.BatchSizes[Random(Length(Config.BatchSizes))];
    currentConfig.NumHeads := Config.NumHeadsOptions[Random(Length(Config.NumHeadsOptions))];
    currentConfig.FFNDim := Config.FFNDimOptions[Random(Length(Config.FFNDimOptions))];
    currentConfig.NumLayers := Config.NumLayersOptions[Random(Length(Config.NumLayersOptions))];
    currentConfig.DropoutRate := Config.DropoutRates[Random(Length(Config.DropoutRates))];
    currentConfig.WeightDecay := Config.WeightDecays[Random(Length(Config.WeightDecays))];

    // Skip non-positive sizes (this also protects the "mod" below from a
    // zero divisor)
    if (currentConfig.NumHeads <= 0) or (currentConfig.FFNDim <= 0) or
       (currentConfig.NumLayers <= 0) or (currentConfig.BatchSize <= 0) then Continue;

    // FFNDim has to be divisible by the number of heads
    if currentConfig.FFNDim mod currentConfig.NumHeads <> 0 then Continue;

    with transformerConfig do begin
      InputSize := 300;
      NumLayers := currentConfig.NumLayers;
      NumHeads := currentConfig.NumHeads;
      FFNDim := currentConfig.FFNDim;
      MaxSeqLength := 100;
      DropoutRate := currentConfig.DropoutRate;
      WeightDecay := currentConfig.WeightDecay;
      GradientClipValue := 1.0;
      UseLayerNorm := True;
    end;

    try
      InitTransformer(tempModel, transformerConfig);
      try
        TrainTransformerWithValidation(tempModel, trainData, valData, 
                                     currentConfig.LearningRate, Config.MaxEpochs,
                                     currentConfig.BatchSize);

        currentConfig.ValidationLoss := EvaluateModel(tempModel, valData);

        if currentConfig.ValidationLoss < bestLoss then
        begin
          bestLoss := currentConfig.ValidationLoss;
          bestConfig := currentConfig;
          haveBest := True;
        end;
      finally
        // Release the temporary model even when training/evaluation raised
        FreeTransformer(tempModel);
      end;
    except
      on E: Exception do
        WriteLn('Trial ', i, ' failed: ', E.Message);
    end;
  end;

  // Report the outcome; without this the search result would be lost
  if haveBest then
    WriteLn('Best random-search config: lr=', bestConfig.LearningRate:0:6,
            ', bs=', bestConfig.BatchSize,
            ', heads=', bestConfig.NumHeads,
            ', ffn=', bestConfig.FFNDim,
            ', layers=', bestConfig.NumLayers,
            ', dropout=', bestConfig.DropoutRate:0:1,
            ', wd=', bestConfig.WeightDecay:0:6,
            ', loss=', bestConfig.ValidationLoss:0:4)
  else
    WriteLn('Random search finished without a successful trial.');
end;

{
  Removes every example whose Input or ExpectedOutput is empty or consists
  only of whitespace.

  Dataset is left untouched when all examples are usable. Otherwise a summary
  line and one line per removed example are printed, and Dataset is replaced
  by a newly allocated array that holds the usable examples in their original
  order.
}
procedure FilterEmptyExamples(var Dataset: TTrainingDataset);

  // True when both texts of Dataset[Index] are non-empty and not blank.
  function IsUsable(Index: Integer): Boolean;
  begin
    Result := (Dataset[Index].Input.Length > 0) and
              (Dataset[Index].ExpectedOutput.Length > 0) and
              (Dataset[Index].Input.Trim <> '') and
              (Dataset[Index].ExpectedOutput.Trim <> '');
  end;

var
  idx, kept, writePos: Integer;
  compacted: TTrainingDataset;
begin
  // Pass 1: count the usable examples so the target array can be sized
  kept := 0;
  for idx := 0 to High(Dataset) do
    if IsUsable(idx) then
      Inc(kept);

  // Nothing to remove
  if kept = Length(Dataset) then
    Exit;

  WriteLn('Filtering dataset: ', Length(Dataset), ' -> ', kept, ' examples');

  // Pass 2: copy the usable examples and report the rejected ones
  SetLength(compacted, kept);
  writePos := 0;
  for idx := 0 to High(Dataset) do
  begin
    if not IsUsable(idx) then
    begin
      WriteLn('Removing empty example ', idx, ': Input="', Dataset[idx].Input, 
              '", Output="', Dataset[idx].ExpectedOutput, '"');
      Continue;
    end;

    compacted[writePos] := Dataset[idx];
    Inc(writePos);
  end;

  Dataset := compacted;
end;


{
  Quick learning-rate comparison on a small subset of 'training_data.txt'.

  Returns silently when the file does not exist, and prints a message and
  returns when fewer than 10 usable examples remain after filtering. For each
  candidate learning rate a temporary model is created from
  TransformerModel.Config, trained on the first 10 examples and evaluated on
  those same examples. The best rate is printed; the main model is not
  retrained here.

  Exceptions raised for one learning rate are reported with WriteLn and the
  comparison continues with the next rate.
}
procedure SimpleHyperparameterTuning;
var
  TrainingData: TTrainingDataset;
  bestLR: Double;
  bestLoss, currentLoss: Double;
  learningRates: array of Double;
  i: Integer;
  tempModel: TTransformer;
  tempData: TTrainingDataset;
begin
  if not FileExists('training_data.txt') then Exit;

  WriteLn('=== ПРОСТАЯ ОПТИМИЗАЦИЯ LEARNING RATE ===');
  
  LoadTrainingData(TrainingData, 'training_data.txt');
  FilterEmptyExamples(TrainingData);
  
  if Length(TrainingData) < 10 then
  begin
    WriteLn('Слишком мало данных для оптимизации: ', Length(TrainingData));
    Exit;
  end;

  // Candidate learning rates to compare
  learningRates := [0.001, 0.0005, 0.0001, 0.00005, 0.00001];
  bestLR := 0.0001;
  bestLoss := MaxDouble;

  for i := 0 to High(learningRates) do
  begin
    try
      WriteLn('Тестируем LR=', learningRates[i]:0:6);

      // Fresh subset for every run. Copy() duplicates the elements with
      // correct reference counting; a raw Move() must not be used here
      // because the examples contain managed strings.
      tempData := Copy(TrainingData, 0, Min(10, Length(TrainingData)));

      // Temporary model for this test run
      InitTransformer(tempModel, TransformerModel.Config);
      try
        // Short training run on the subset
        TrainTransformerWithIndexing(tempModel, tempData, 2, learningRates[i]);

        // Measure the loss
        currentLoss := EvaluateModel(tempModel, tempData);
        WriteLn('  Loss: ', currentLoss:0:4);

        if currentLoss < bestLoss then
        begin
          bestLoss := currentLoss;
          bestLR := learningRates[i];
        end;
      finally
        // Release the temporary model even when training/evaluation raised
        FreeTransformer(tempModel);
      end;

    except
      on E: Exception do
        WriteLn('  Ошибка: ', E.Message);
    end;
  end;

  WriteLn('Лучший learning rate: ', bestLR:0:6, ' (loss: ', bestLoss:0:4, ')');
  
  // Only announce the follow-up step when at least one run succeeded
  if bestLoss < MaxDouble then
  begin
    WriteLn('Применяем лучший LR к основной модели...');
    // The model could be retrained with the best LR here
  end;
end;

end.