unit TrainerUnit;
{$MODE OBJFPC}{$H+}{$RANGECHECKS ON}

{
    Part of AdvancedChatAI.
    For GNU/Linux 64 bit version.
    Version: 1.
    Written in Free Pascal (https://freepascal.org/).
    Copyright (C) 2025-2026 Artyomov Alexander
    Used https://chat.deepseek.com/
    http://self-made-free.ru/
    aralni@mail.ru

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
}


interface

uses
  SysUtils, DataUtils, Classes, LazUTF8, ucs4unit, ucs4opunit, ucs4functionsunit,
  Transformer, MatrixOps, TextEmbeddings, Attention,Math,Optimizers, GradientAccumulator,Word2Vec;

type
  { One supervised training sample: the input text, the output the model is
    expected to produce for it, and optional context text. }
  TTrainingExample = record
    Input, ExpectedOutput, Context: string;
  end;

  { A dataset is a dynamic array of examples. }
  TTrainingDataset = array of TTrainingExample;
  { A list of datasets; this is the return type of CreateMiniBatches. }
  TTrainingDatasetArray = array of TTrainingDataset;

  { Learning-rate schedule: linear warmup over FWarmupSteps followed by a
    cosine term scaled by FDecaySteps (see GetLearningRate). }
  TLearningRateScheduler = class
  private
    // Base rate and the rate computed by the most recent call to Step.
    FInitialLR, FCurrentLR: Double;
    // Step counter (advanced by Step) and the lengths of the two phases.
    FStep, FWarmupSteps, FDecaySteps: Integer;
  public
    constructor Create(InitialLR: Double; WarmupSteps: Integer = 1000; DecaySteps: Integer = 10000);
    // Returns the learning rate for an arbitrary step index.
    function GetLearningRate(Step: Integer): Double;
    // Increments the step counter and refreshes FCurrentLR.
    procedure Step;
  end;

{ Public API of the unit: dataset loading and splitting, loss and gradient
  helpers, several training-loop variants, batching utilities, and model
  persistence. Only part of these routines is implemented in the visible
  portion of the file. }
procedure LoadTrainingData(var Dataset: TTrainingDataset; const Filename: string);
procedure TrainTransformer(var Model: TTransformer; const Dataset: TTrainingDataset; Epochs: Integer; LearningRate: Double);
procedure TrainTransformerWithBackward(var Model: TTransformer; const Dataset: TTrainingDataset; Epochs: Integer; LearningRate: Double);
function CalculateLoss(const Output, Target: TDoubleMatrix): Double;
function CreateContextEnrichedInput(const inputText: ucs4; const context: string): TDoubleMatrix;
procedure BatchTrainFromLogs(const LogFilename: string; var TransformerModel: TTransformer);
function CalculateGradient(const Output, Target: TDoubleMatrix): TDoubleMatrix;
procedure ResetAttentionGradients(var Attention: TMultiHeadAttention);
procedure SaveModelWeights(const Model: TTransformer; const Filename: string);
procedure LoadModelWeights(var Model: TTransformer; const Filename: string);
procedure TrainTransformerWithRegularization(var Model: TTransformer; const Dataset: TTrainingDataset; Epochs: Integer; LearningRate: Double);
procedure SplitDataset(const Dataset: TTrainingDataset; out TrainData, ValData: TTrainingDataset; ValidationSplit: Double);
procedure TrainTransformerWithValidation(var Model: TTransformer; const TrainData, ValData: TTrainingDataset; LearningRate: Double; Epochs: Integer; BatchSize: Integer);
function TrainEpoch(var Model: TTransformer; const Dataset: TTrainingDataset; LearningRate: Double; BatchSize: Integer): Double;
procedure CreateBatch(const Dataset: TTrainingDataset; StartIndex: Integer; BatchSize: Integer; out Inputs, Targets: TDoubleMatrix);
function CountTokens(const text: string): Integer;
procedure FillExampleMatrix(var Matrix: TDoubleMatrix; StartRow: Integer; const text: string; maxTokens, embeddingSize: Integer);
function EvaluateModel(const M: TTransformer; const Dataset: TTrainingDataset): Double;
procedure SaveModel(const Model: TTransformer; const Filename: string);
procedure LoadModel(var Model: TTransformer; const Filename: string);
procedure ProcessBatch(var Model: TTransformer; const Batch: TTrainingDataset);
// Overload 1 of ProcessTrainingBatch: reports only the batch loss.
procedure ProcessTrainingBatch(var Model: TTransformer; const Batch: TTrainingDataset; LearningRate: Double; out BatchLoss: Double);
function CombineMatricesVertically(const Matrices: array of TDoubleMatrix): TDoubleMatrix;
function CreateMiniBatches(const Dataset: TTrainingDataset; BatchSize: Integer): TTrainingDatasetArray;
function CalculateOptimalBatchSize(AvailableMemory: Integer; ModelSize: Integer): Integer;
procedure ProcessTrainingBatchWithAccumulation(var Model: TTransformer; const Batch: TTrainingDataset; LearningRate: Double; GradientAccumulationSteps: Integer; out BatchLoss: Double);
procedure ProcessMiniBatch(var Model: TTransformer; const Batch: TTrainingDataset; Step, TotalSteps: Integer; out inputs, targets, outputs, gradOutput: TDoubleMatrix);
// Overload 2 of ProcessTrainingBatch: returns the intermediate matrices.
procedure ProcessTrainingBatch(var Model: TTransformer; const Batch: TTrainingDataset; LearningRate: Double; out inputs, targets, outputs, gradOutput: TDoubleMatrix);
function TrainEpochWithBatches(var Model: TTransformer; const Dataset: TTrainingDataset; LearningRate: Double; BatchSize: Integer): Double;
procedure TrainTransformerWithIndexing(var Model: TTransformer; 
                                     const Dataset: TTrainingDataset; 
                                     Epochs: Integer; 
                                     LearningRate: Double);
procedure ValidateModel(var Model: TTransformer);
function ValidateModelStructure(var Model: TTransformer): Boolean;

implementation

{ Loads training examples from a text file into Dataset.

  File format (one item per line, surrounding whitespace is trimmed):
    ---        starts a new example
    <InputMarker>text    sets the example's Input
    <OutputMarker>text   sets the example's ExpectedOutput
    <ContextMarker>text  appends a line (plus #10) to the example's Context
  Lines that appear before the first '---' are ignored.

  If the file does not exist a message is printed and Dataset is left
  untouched. Otherwise Dataset is replaced with the examples read.

  The value of each line is taken from right after the marker, using the
  marker's actual byte length. A fixed offset of 4 is only correct when the
  Cyrillic letter occupies one byte; with a UTF-8 encoded marker (4 bytes)
  it left the separating space at the start of every value. }
procedure LoadTrainingData(var Dataset: TTrainingDataset; const Filename: string);
const
  // Typed constants so Length() is measured at run time on the very same
  // bytes that StartsWith compares against.
  InputMarker: string = 'В: ';
  OutputMarker: string = 'О: ';
  ContextMarker: string = 'К: ';
var
  F: TextFile;
  Line: string;
  CurrentExample: Integer;
begin
  if not FileExists(Filename) then begin
    WriteLn('Файл обучения не найден: ', Filename);
    Exit;
  end;
  AssignFile(F, Filename);
  Reset(F);
  try
    CurrentExample := -1;
    SetLength(Dataset, 0);
    while not Eof(F) do begin
      ReadLn(F, Line);
      Line := Line.Trim;
      if Line = '---' then begin
        // Separator: open a new, empty example.
        SetLength(Dataset, Length(Dataset) + 1);
        CurrentExample := High(Dataset);
        Dataset[CurrentExample].Context := '';
      end
      else if CurrentExample >= 0 then begin
        if Line.StartsWith(InputMarker) then
          Dataset[CurrentExample].Input := Copy(Line, Length(InputMarker) + 1, MaxInt)
        else if Line.StartsWith(OutputMarker) then
          Dataset[CurrentExample].ExpectedOutput := Copy(Line, Length(OutputMarker) + 1, MaxInt)
        else if Line.StartsWith(ContextMarker) then
          Dataset[CurrentExample].Context := Dataset[CurrentExample].Context +
            Copy(Line, Length(ContextMarker) + 1, MaxInt) + #10;
      end;
    end;
  finally
    // Always release the handle, even when ReadLn raises an I/O error.
    CloseFile(F);
  end;
  WriteLn('Загружено примеров обучения: ', Length(Dataset));
end;

{ Perturbs the model weights with random noise.

  Every embedding element receives a uniform offset in
  [-LearningRate, +LearningRate). For each layer, the Wq/Wk/Wv/Wo matrices
  of every attention head and both FFN weight matrices are passed to
  AddNoise with a magnitude of LearningRate * 0.1. }
procedure ApplyRandomWeightUpdate(var Model: TTransformer; LearningRate: Double);
var
  row, col, layerIdx, headIdx: Integer;
begin
  // Embedding matrix: element-wise uniform perturbation.
  for row := 0 to High(Model.Embedding) do
    for col := 0 to High(Model.Embedding[row]) do
      Model.Embedding[row][col] := Model.Embedding[row][col] +
        (Random * 2 - 1) * LearningRate;

  // Per-layer weights.
  for layerIdx := 0 to High(Model.Layers) do
  begin
    // Attention projections of every head.
    for headIdx := 0 to High(Model.Layers[layerIdx].SelfAttention.Heads) do
    begin
      AddNoise(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].Wq, LearningRate * 0.1);
      AddNoise(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].Wk, LearningRate * 0.1);
      AddNoise(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].Wv, LearningRate * 0.1);
      AddNoise(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].Wo, LearningRate * 0.1);
    end;

    // Feed-forward weights.
    AddNoise(Model.Layers[layerIdx].FFN_weights1, LearningRate * 0.1);
    AddNoise(Model.Layers[layerIdx].FFN_weights2, LearningRate * 0.1);
  end;
end;
{ Simplified training loop without back-propagation.

  For every example the input is embedded, scaled by 0.01, and run through
  ForwardTransformer; the norm of the output is accumulated as a monitoring
  value. With a 30% chance per example the weights are perturbed by
  ApplyRandomWeightUpdate. Examples that raise an exception are reported and
  skipped. The averaged value is printed every 10th epoch.

  An empty dataset returns right after the header lines instead of dividing
  by zero when the per-epoch average is computed.

  NOTE(review): CreateContextEnrichedInput already scales its result by
  0.01, so the extra ScaleMatrixCreate here gives an effective factor of
  0.0001 - confirm this is intended. }
procedure TrainTransformer(var Model: TTransformer; const Dataset: TTrainingDataset;
  Epochs: Integer; LearningRate: Double);
var
  epoch, i: Integer;
  inputMatrix, outputMatrix: TDoubleMatrix;
  totalLoss: Double;
begin
  WriteLn('Начинаем обучение трансформера (упрощенная версия)...');
  WriteLn('Примеров для обучения: ', Length(Dataset));

  // Nothing to train on; also avoids totalLoss / 0 below.
  if Length(Dataset) = 0 then
    Exit;

  for epoch := 1 to Epochs do begin
    totalLoss := 0;

    for i := 0 to High(Dataset) do begin
      try
        // Build the input embedding.
        inputMatrix := CreateContextEnrichedInput(
          Dataset[i].Input,
          Dataset[i].Context
        );

        // Shrink the values before the forward pass.
        inputMatrix := ScaleMatrixCreate(inputMatrix, 0.01);

        // Forward pass.
        ForwardTransformer(Model, inputMatrix, outputMatrix);

        // Monitoring value only: norm of the output, not a real loss.
        totalLoss := totalLoss + MatrixNorm(outputMatrix);

        // Simplified update: random weight perturbation, 30% chance.
        if Random(100) < 30 then
          ApplyRandomWeightUpdate(Model, LearningRate);

      except
        on E: Exception do
          WriteLn('Пропускаем пример ', i, ' из-за ошибки: ', E.Message);
      end;
    end;

    totalLoss := totalLoss / Length(Dataset);
    if epoch mod 10 = 0 then
      WriteLn('Эпоха: ', epoch, ' Loss: ', totalLoss:0:6);
  end;
end;

{ Trains the model with a full forward/backward pass per example.

  For each epoch and each example: builds the input and target matrices,
  validates their widths against Model.Config.InputSize, runs the forward
  pass, computes loss and gradient, back-propagates, updates the weights
  with LearningRate and resets the gradients. Empty or mismatching examples
  are skipped with a message.

  Changes relative to the previous revision:
  - An exception in one example no longer terminates the whole program; the
    handler's own message announces "skip and continue", and that is now
    what happens.
  - The per-epoch mean is taken over the examples that were actually
    trained on, so skipped or failed examples do not dilute it.

  NOTE(review): if BackwardTransformer fails midway, partially accumulated
  gradients may remain in the model until the next ResetGradients call -
  verify against the Transformer unit. }
procedure TrainTransformerWithBackward(var Model: TTransformer; const Dataset: TTrainingDataset; Epochs: Integer; LearningRate: Double);
var
  epoch, i, processed: Integer;
  inputMatrix, targetMatrix, outputMatrix, gradOutput: TDoubleMatrix;
  loss, totalLoss: Double;
begin
  // Check the model dimensions before training starts.
  CheckModelDimensions(Model);
  for epoch := 1 to Epochs do begin
    totalLoss := 0;
    processed := 0;
    for i := 0 to High(Dataset) do begin
      try
        WriteLn('Обработка примера ', i, ': "', Copy(Dataset[i].Input, 1, 30), '"');
        // Skip empty examples.
        if (Dataset[i].Input = '') or (Dataset[i].ExpectedOutput = '') then begin
          WriteLn('  Пропускаем пустой пример');
          Continue;
        end;
        inputMatrix := CreateContextEnrichedInput(
          Dataset[i].Input,
          Dataset[i].Context
        );
        targetMatrix := TextsToMatrix(
          [Dataset[i].ExpectedOutput],
          Model.Config.InputSize
        );
        // Validate dimensions.
        if (Length(inputMatrix) = 0) or (Length(targetMatrix) = 0) then begin
          WriteLn('  Пустой input или target, пропускаем');
          Continue;
        end;
        if Length(inputMatrix[0]) <> Model.Config.InputSize then begin
          WriteLn('  Несовпадение размерности input: ', Length(inputMatrix[0]),
                  ' != ', Model.Config.InputSize);
          Continue;
        end;
        if Length(targetMatrix[0]) <> Model.Config.InputSize then begin
          WriteLn('  Несовпадение размерности target: ', Length(targetMatrix[0]),
                  ' != ', Model.Config.InputSize);
          Continue;
        end;
        // Forward pass.
        ForwardTransformer(Model, inputMatrix, outputMatrix);
        // Validate the output.
        if (Length(outputMatrix) = 0) or (Length(outputMatrix[0]) = 0) then begin
          WriteLn('  Пустой output, пропускаем пример');
          Continue;
        end;
        // Loss and its gradient with respect to the output.
        loss := CalculateLoss(outputMatrix, targetMatrix);
        gradOutput := CalculateGradient(outputMatrix, targetMatrix);
        // Backward pass, weight update, gradient cleanup.
        BackwardTransformer(Model, inputMatrix, gradOutput);
        UpdateTransformer(Model, LearningRate);
        ResetGradients(Model);
        // Count the example only once every step above has succeeded.
        totalLoss := totalLoss + loss;
        Inc(processed);
        WriteLn('  Loss: ', loss:0:6);
      except
        on E: Exception do
        begin
          WriteLn('  ОШИБКА в примере ', i, ': ', E.Message);
          WriteLn('  Пропускаем пример и продолжаем...');
        end;
      end;
    end;
    if processed > 0 then
      totalLoss := totalLoss / processed
    else
      totalLoss := 0;
    WriteLn('  Средний Loss: ', totalLoss:0:6);
  end;
end;

{ Mean squared error between Output and Target.

  Only the overlapping region is compared: rows 0..Min(rows)-1 and, per row,
  columns 0..Min(cols)-1. The sum of squared differences is divided by the
  number of compared elements. Returns 0.0 when nothing overlaps, including
  when either matrix is empty.

  The matrix sizes and the resulting loss are printed. The column counts for
  that print are read safely, so an empty matrix no longer triggers a range
  check error on [0] before the guarded computation is reached. }
function CalculateLoss(const Output, Target: TDoubleMatrix): Double;
var
  i, j, outputCols, targetCols: Integer;
  diff, totalElements: Double;
begin
  // Column counts for diagnostics; 0 when there is no first row.
  outputCols := 0;
  if Length(Output) > 0 then
    outputCols := Length(Output[0]);
  targetCols := 0;
  if Length(Target) > 0 then
    targetCols := Length(Target[0]);

  WriteLn('CalculateLoss:');
  WriteLn('  Output: ', Length(Output), 'x', outputCols);
  WriteLn('  Target: ', Length(Target), 'x', targetCols);

  Result := 0;
  totalElements := 0;

  // Accumulate squared differences over the overlapping region only.
  for i := 0 to Min(High(Output), High(Target)) do
  begin
    for j := 0 to Min(High(Output[i]), High(Target[i])) do
    begin
      diff := Output[i][j] - Target[i][j];
      Result := Result + diff * diff;
      totalElements := totalElements + 1;
    end;
  end;

  // Mean over the compared elements; 0.0 when none were compared.
  if totalElements > 0 then
    Result := Result / totalElements
  else
    Result := 0.0;

  WriteLn('  Loss: ', Result:0:6);
end;

{ Builds the embedding matrix for an input text, optionally prefixed with
  context.

  The input is trimmed; if nothing remains, an error is printed and a 1x300
  zero matrix is returned. Otherwise the text is converted to UTF-8, the
  trimmed context (when non-empty) is put in front of it, and the result is
  embedded with TextsToMatrix at width 300. A non-empty embedding is scaled
  by 0.01; an empty one is replaced by a 1x300 zero matrix. }
function CreateContextEnrichedInput(const inputText: ucs4; const context: string): TDoubleMatrix;
var
  trimmedInput: ucs4;
  fullText, trimmedContext: string;

  // True when M has no rows or its first row has no columns.
  function IsBlank(const M: TDoubleMatrix): Boolean;
  begin
    Result := (Length(M) = 0) or (Length(M[0]) = 0);
  end;

begin
  trimmedInput := Trim(inputText);

  // Guard: nothing left to embed.
  if trimmedInput.Length = 0 then
  begin
    WriteLn('ERROR: Empty input text after trimming');
    SetLength(Result, 1, 300);
    FillMatrix(Result, 0.0);
    Exit;
  end;

  // Work on a UTF-8 string from here on.
  fullText := trimmedInput.ToUTF8;

  // Prepend the context when there is any.
  trimmedContext := context.Trim;
  if trimmedContext <> '' then
    fullText := trimmedContext + #10 + 'Текущее сообщение: ' + fullText;

  WriteLn('CreateContextEnrichedInput: processing: "', Copy(fullText, 1, 50), '"');

  Result := TextsToMatrix([fullText], 300);

  // Scale the values down by a factor of 100.
  if not IsBlank(Result) then
    Result := ScaleMatrixCreate(Result, 0.01);

  // Second check: fall back to a zero embedding.
  if IsBlank(Result) then
  begin
    WriteLn('CRITICAL: Fallback to default embedding');
    SetLength(Result, 1, 300);
    FillMatrix(Result, 0.0);
  end;
end;

{ Extracts user/AI message pairs from a chat log and trains on them.

  Lines starting with 'USER: ' update the remembered user message. Each
  line starting with 'AI: ' produces one training example that pairs the
  remembered user message with that reply. When at least one pair was
  collected, TrainTransformer is run for 50 epochs at rate 0.0005. }
procedure BatchTrainFromLogs(const LogFilename: string; var TransformerModel: TTransformer);
var
  logLines: TStringList;
  lineIdx, slot: Integer;
  entry, pendingUser, reply: string;
  pairs: TTrainingDataset;
begin
  logLines := TStringList.Create;
  try
    logLines.LoadFromFile(LogFilename);
    lineIdx := 0;
    while lineIdx < logLines.Count do
    begin
      entry := logLines[lineIdx];
      if entry.StartsWith('USER: ') then
        pendingUser := Copy(entry, 7, MaxInt)
      else if entry.StartsWith('AI: ') then
      begin
        reply := Copy(entry, 5, MaxInt);
        // Append one (user message, AI reply) pair.
        slot := Length(pairs);
        SetLength(pairs, slot + 1);
        pairs[slot].Input := pendingUser;
        pairs[slot].ExpectedOutput := reply;
      end;
      Inc(lineIdx);
    end;
    if Length(pairs) > 0 then
      TrainTransformer(TransformerModel, pairs, 50, 0.0005);
  finally
    logLines.Free;
  end;
end;

{ Gradient of the MSE loss with respect to Output:
    dL/dOutput[i][j] = 2 * (Output[i][j] - Target[i][j]) / n
  where n = rows(Output) * cols(Output[0]).

  The result has the shape rows(Output) x cols(Output[0]). Positions that
  have no counterpart in Target get a gradient of 0.0.

  Raises Exception when Output has no rows: its width cannot be determined,
  and an explicit error is clearer than the range check error that indexing
  Output[0] would produce. An empty Target no longer fails in the
  diagnostic print; it yields an all-zero gradient, as the guarded loop
  always intended. }
function CalculateGradient(const Output, Target: TDoubleMatrix): TDoubleMatrix;
var
  i, j, targetCols: Integer;
  totalElements: Double;
begin
  if Length(Output) = 0 then
    raise Exception.Create('CalculateGradient: empty output matrix');

  // Column count for diagnostics; 0 when Target has no first row.
  targetCols := 0;
  if Length(Target) > 0 then
    targetCols := Length(Target[0]);

  WriteLn('CalculateGradient:');
  WriteLn('  Output: ', Length(Output), 'x', Length(Output[0]));
  WriteLn('  Target: ', Length(Target), 'x', targetCols);

  SetLength(Result, Length(Output), Length(Output[0]));
  totalElements := Length(Output) * Length(Output[0]);

  // Bounds are checked against Output, Result and Target on every access.
  for i := 0 to Min(High(Output), High(Result)) do
  begin
    for j := 0 to Min(High(Output[i]), High(Result[i])) do
    begin
      if (i <= High(Target)) and (j <= High(Target[i])) then
        Result[i][j] := 2.0 * (Output[i][j] - Target[i][j]) / totalElements
      else
        Result[i][j] := 0.0;
    end;
  end;

  WriteLn('  Gradient: ', Length(Result), 'x', Length(Result[0]));
end;

{ Fills the gradient matrices dWq, dWk, dWv and dWo of every attention head
  with 0.0. }
procedure ResetAttentionGradients(var Attention: TMultiHeadAttention);
var
  headIdx: Integer;
begin
  headIdx := 0;
  while headIdx <= High(Attention.Heads) do
  begin
    FillMatrix(Attention.Heads[headIdx].dWq, 0.0);
    FillMatrix(Attention.Heads[headIdx].dWk, 0.0);
    FillMatrix(Attention.Heads[headIdx].dWv, 0.0);
    FillMatrix(Attention.Heads[headIdx].dWo, 0.0);
    Inc(headIdx);
  end;
end;

{ Writes the model configuration and all weights to a binary file.

  Layout (native byte order, no padding):
    InputSize, NumLayers, NumHeads, FFNDim, MaxSeqLength : Integer each
    DropoutRate                                          : Double
    Embedding matrix, row by row
    per layer:
      per head: Wq, Wk, Wv, Wo (row by row)
      FFN_weights1, FFN_weights2 (row by row)
      Norm1_Gamma, Norm1_Beta, Norm2_Gamma, Norm2_Beta
  Matrix and vector dimensions are not stored; LoadModelWeights relies on
  the receiving model having the same shapes.

  The file is opened before the try block, so a failed Rewrite propagates
  its own error instead of being masked by CloseFile on an unopened file.
  Rows are written with one BlockWrite each; the bytes produced are the same
  as when writing element by element.

  Assumes matrix rows and the LayerNorm vectors are dynamic arrays of
  Double (as the TDoubleMatrix name suggests) - TODO confirm for the Norm
  fields. }
procedure SaveModelWeights(const Model: TTransformer; const Filename: string);
var
  F: File;
  layer, head: Integer;

  // Writes all elements of V as consecutive raw Doubles.
  procedure WriteVector(const V: array of Double);
  begin
    if Length(V) > 0 then
      BlockWrite(F, V[0], Length(V) * SizeOf(Double));
  end;

  // Writes M row by row.
  procedure WriteMatrix(const M: TDoubleMatrix);
  var
    r: Integer;
  begin
    for r := 0 to High(M) do
      WriteVector(M[r]);
  end;

begin
  AssignFile(F, Filename);
  Rewrite(F, 1); // untyped file with a record size of one byte
  try
    // 1. Configuration header.
    BlockWrite(F, Model.Config.InputSize, SizeOf(Integer));
    BlockWrite(F, Model.Config.NumLayers, SizeOf(Integer));
    BlockWrite(F, Model.Config.NumHeads, SizeOf(Integer));
    BlockWrite(F, Model.Config.FFNDim, SizeOf(Integer));
    BlockWrite(F, Model.Config.MaxSeqLength, SizeOf(Integer));
    BlockWrite(F, Model.Config.DropoutRate, SizeOf(Double));

    // 2. Embedding matrix.
    WriteMatrix(Model.Embedding);

    // 3. Weights of every layer.
    for layer := 0 to High(Model.Layers) do begin
      // Attention projections, head by head.
      for head := 0 to High(Model.Layers[layer].SelfAttention.Heads) do begin
        WriteMatrix(Model.Layers[layer].SelfAttention.Heads[head].Wq);
        WriteMatrix(Model.Layers[layer].SelfAttention.Heads[head].Wk);
        WriteMatrix(Model.Layers[layer].SelfAttention.Heads[head].Wv);
        WriteMatrix(Model.Layers[layer].SelfAttention.Heads[head].Wo);
      end;

      // Feed-forward weights.
      WriteMatrix(Model.Layers[layer].FFN_weights1);
      WriteMatrix(Model.Layers[layer].FFN_weights2);

      // LayerNorm parameters.
      WriteVector(Model.Layers[layer].Norm1_Gamma);
      WriteVector(Model.Layers[layer].Norm1_Beta);
      WriteVector(Model.Layers[layer].Norm2_Gamma);
      WriteVector(Model.Layers[layer].Norm2_Beta);
    end;

    WriteLn('Модель сохранена в файл: ', Filename);

  finally
    CloseFile(F);
  end;
end;

{ Reads model weights written by SaveModelWeights into an already
  constructed model.

  The file stores no matrix dimensions, so the amount of data read for each
  matrix or vector is taken from the current shapes inside Model. The
  stored InputSize, NumLayers, NumHeads and FFNDim must therefore match
  Model.Config; otherwise the stream would be misaligned and the weights
  silently loaded into the wrong places. FFNDim was previously read but not
  compared. MaxSeqLength and DropoutRate are read only to advance the file
  position.

  Raises Exception when the file does not exist or the configuration is
  incompatible. I/O errors raised by Reset or BlockRead propagate. The file
  is opened before the try block so that a failed Reset is not masked by
  CloseFile on an unopened file.

  Assumes matrix rows and the LayerNorm vectors are dynamic arrays of
  Double - TODO confirm for the Norm fields. }
procedure LoadModelWeights(var Model: TTransformer; const Filename: string);
var
  F: File;
  layer, head, inputSize, numLayers, numHeads, ffnDim, maxSeqLength: Integer;
  dropoutRate: Double;

  // Fills V with Length(V) consecutive raw Doubles from the file.
  procedure ReadVector(var V: array of Double);
  begin
    if Length(V) > 0 then
      BlockRead(F, V[0], Length(V) * SizeOf(Double));
  end;

  // Fills M row by row. M is passed by value on purpose: a dynamic array
  // value is a reference, so the caller's rows are the ones being filled.
  procedure ReadMatrix(M: TDoubleMatrix);
  var
    r: Integer;
  begin
    for r := 0 to High(M) do
      ReadVector(M[r]);
  end;

begin
  if not FileExists(Filename) then
    raise Exception.Create('Файл модели не найден: ' + Filename);

  AssignFile(F, Filename);
  Reset(F, 1); // untyped file with a record size of one byte
  try
    // 1. Configuration header.
    BlockRead(F, inputSize, SizeOf(Integer));
    BlockRead(F, numLayers, SizeOf(Integer));
    BlockRead(F, numHeads, SizeOf(Integer));
    BlockRead(F, ffnDim, SizeOf(Integer));
    BlockRead(F, maxSeqLength, SizeOf(Integer));
    BlockRead(F, dropoutRate, SizeOf(Double));

    // Every value that determines weight shapes must match.
    if (inputSize <> Model.Config.InputSize) or
       (numLayers <> Model.Config.NumLayers) or
       (numHeads <> Model.Config.NumHeads) or
       (ffnDim <> Model.Config.FFNDim) then
      raise Exception.Create('Несовместимая конфигурация модели');

    // 2. Embedding matrix.
    ReadMatrix(Model.Embedding);

    // 3. Weights of every layer.
    for layer := 0 to High(Model.Layers) do begin
      // Attention projections, head by head.
      for head := 0 to High(Model.Layers[layer].SelfAttention.Heads) do begin
        ReadMatrix(Model.Layers[layer].SelfAttention.Heads[head].Wq);
        ReadMatrix(Model.Layers[layer].SelfAttention.Heads[head].Wk);
        ReadMatrix(Model.Layers[layer].SelfAttention.Heads[head].Wv);
        ReadMatrix(Model.Layers[layer].SelfAttention.Heads[head].Wo);
      end;

      // Feed-forward weights.
      ReadMatrix(Model.Layers[layer].FFN_weights1);
      ReadMatrix(Model.Layers[layer].FFN_weights2);

      // LayerNorm parameters.
      ReadVector(Model.Layers[layer].Norm1_Gamma);
      ReadVector(Model.Layers[layer].Norm1_Beta);
      ReadVector(Model.Layers[layer].Norm2_Gamma);
      ReadVector(Model.Layers[layer].Norm2_Beta);
    end;

    WriteLn('Модель загружена из файла: ', Filename);

  finally
    CloseFile(F);
  end;
end;

{ Training loop skeleton with gradient clipping and a warmup/cosine
  learning-rate schedule (1000 warmup steps, 10000 decay steps).

  For every example the model gradients are clipped to
  Model.Config.GradientClipValue, the weights are updated, and the
  scheduler advances by one step. Any exception is reported and terminates
  the program via Halt.

  The learning rate is now queried for every step. It used to be sampled
  once per epoch, while the scheduler advanced per example; with the warmup
  starting at step 0 that meant the entire first epoch ran at a rate of 0.
  The scheduler is also released through try/finally.

  NOTE(review): the forward and backward passes are still a placeholder
  here, so the update uses whatever gradients the model currently holds. }
procedure TrainTransformerWithRegularization(var Model: TTransformer; const Dataset: TTrainingDataset; Epochs: Integer; LearningRate: Double);
var
  scheduler: TLearningRateScheduler;
  epoch, i: Integer;
  currentLR: Double;
begin
  scheduler := TLearningRateScheduler.Create(LearningRate, 1000, 10000);
  try
    for epoch := 1 to Epochs do begin
      for i := 0 to High(Dataset) do begin
        try
          // Rate for the current scheduler step.
          currentLR := scheduler.GetLearningRate(scheduler.FStep);

          // ... forward and backward pass ...

          // Clip gradients across the whole model.
          ApplyGradientClippingToModel(Model, Model.Config.GradientClipValue);

          // Weight update with the scheduled rate.
          UpdateTransformer(Model, currentLR);

          scheduler.Step;

        except
          on E: Exception do begin
            WriteLn('TrainTransformerWithRegularization: Training error: ', E.Message);
            Halt;
          end;
        end;
      end;
    end;
  finally
    scheduler.Free;
  end;
end;

{ Splits Dataset into a leading training part and a trailing validation
  part, preserving order.

  The training part receives Round(Length(Dataset) * (1 - ValidationSplit))
  examples, clamped to 0..Length(Dataset). Without the clamp a
  ValidationSplit below 0 or above 1 produced an out-of-range index or a
  negative array length. }
procedure SplitDataset(const Dataset: TTrainingDataset; out TrainData, ValData: TTrainingDataset; ValidationSplit: Double);
var
  total, splitIndex: Integer;
  rawSplit: Int64;
begin
  total := Length(Dataset);

  // Round yields Int64; clamp before narrowing to Integer.
  rawSplit := Round(total * (1 - ValidationSplit));
  if rawSplit < 0 then
    rawSplit := 0
  else if rawSplit > total then
    rawSplit := total;
  splitIndex := rawSplit;

  TrainData := Copy(Dataset, 0, splitIndex);
  ValData := Copy(Dataset, splitIndex, total - splitIndex);
end;

{ Trains for up to Epochs epochs, tracking the validation loss.

  After each epoch (TrainEpoch on TrainData, EvaluateModel on ValData) both
  losses are printed. Whenever the validation loss improves, the model is
  saved to 'best_model.bin'. Training stops early once, after epoch 5, the
  validation loss exceeds 1.5 times the training loss. At the end the saved
  best model is loaded back, provided at least one improvement was
  recorded.

  The former local bestModel was removed: it only received an assignment
  from Model and was never read; the best model is kept on disk instead. }
procedure TrainTransformerWithValidation(var Model: TTransformer; const TrainData, ValData: TTrainingDataset; LearningRate: Double; Epochs: Integer; BatchSize: Integer);
const
  // Single place for the checkpoint file name used for save and reload.
  BestModelFile = 'best_model.bin';
var
  epoch: Integer;
  trainLoss, valLoss: Double;
  bestValLoss: Double;
begin
  bestValLoss := MaxDouble;

  for epoch := 1 to Epochs do begin
    // Train on the training set.
    trainLoss := TrainEpoch(Model, TrainData, LearningRate, BatchSize);

    // Validate.
    valLoss := EvaluateModel(Model, ValData);

    WriteLn('Epoch ', epoch, ': Train Loss=', trainLoss:0:4,
            ', Val Loss=', valLoss:0:4);

    // Checkpoint the best model so far.
    if valLoss < bestValLoss then begin
      bestValLoss := valLoss;
      WriteLn('Новый лучший результат! Сохраняем модель...');
      SaveModel(Model, BestModelFile);
    end;

    // Early stopping on overfitting.
    if (epoch > 5) and (valLoss > trainLoss * 1.5) then begin
      WriteLn('Early stopping at epoch ', epoch);
      Break;
    end;
  end;

  // Restore the best checkpoint if one was written.
  if bestValLoss < MaxDouble then
  begin
    WriteLn('Загружаем лучшую модель с val loss=', bestValLoss:0:4);
    LoadModel(Model, BestModelFile);
  end;
end;

{ Resizes target to exactly desiredRows rows, keeping the width of its
  first row.

  Existing rows are copied (at most desiredRows of them); additional rows
  are zero-filled. Nothing happens when the row count already matches.

  Raises Exception when target has no rows but a different row count is
  requested: the width is unknown in that case, and an explicit error
  replaces the range check error that indexing target[0] would produce.
  Source rows shorter than the first row are padded with zeros so the
  result is always rectangular. }
procedure AdjustTargetSize(var target: TDoubleMatrix; desiredRows: Integer);
var
  i, cols: Integer;
  newTarget: TDoubleMatrix;
begin
  if Length(target) = desiredRows then
    Exit;

  if Length(target) = 0 then
    raise Exception.Create('AdjustTargetSize: empty target matrix');

  cols := Length(target[0]);

  WriteLn('AdjustTargetSize: ', Length(target), 'x', cols,
          ' -> ', desiredRows, 'x', cols);

  // SetLength zero-initialises new dynamic array elements, so rows beyond
  // the copied ones are already filled with 0.0.
  SetLength(newTarget, desiredRows, cols);

  for i := 0 to Min(desiredRows, Length(target)) - 1 do begin
    newTarget[i] := Copy(target[i], 0, cols);
    // Pad a short source row; the appended elements are zero.
    if Length(newTarget[i]) < cols then
      SetLength(newTarget[i], cols);
  end;

  target := newTarget;
end;

{ Runs one training pass over Dataset and returns the mean loss of the
  examples that were trained on (0.0 when there were none).

  Per example: embed input and target, pad or trim the target to the
  input's row count, forward pass in training mode, MSE loss and gradient,
  clip the gradient to a norm of 1.0, backward pass, weight update with
  LearningRate * 0.001. Examples with an empty Input or ExpectedOutput are
  skipped; an exception in one example is reported and the loop continues.

  Changes relative to the previous revision:
  - Gradients are reset after every update, matching
    TrainTransformerWithBackward. NOTE(review): this assumes
    UpdateTransformer does not clear them itself - verify; if it does, the
    extra reset is harmless.
  - An example's loss is added to the total only once the example has
    completed, so a failure in the backward pass or update no longer adds
    a loss without counting the example.

  BatchSize is currently unused; examples are processed one at a time. }
function TrainEpoch(var Model: TTransformer; const Dataset: TTrainingDataset; LearningRate: Double; BatchSize: Integer): Double;
var
  i, validExamples: Integer;
  totalLoss, loss, actualLearningRate, gradNorm: Double;
  inputMatrix, targetMatrix, outputMatrix, gradOutput: TDoubleMatrix;
begin
  totalLoss := 0.0;
  validExamples := 0;

  // The effective rate is 1000 times smaller than the one passed in.
  actualLearningRate := LearningRate * 0.001;

  for i := 0 to High(Dataset) do begin
    try
      // Skip empty examples.
      if (Dataset[i].Input = '') or (Dataset[i].ExpectedOutput = '') then
        Continue;

      WriteLn('TrainEpoch: пример ', i, ': "', Copy(Dataset[i].Input, 1, 30), '"');

      inputMatrix := CreateContextEnrichedInput(Dataset[i].Input, Dataset[i].Context);
      targetMatrix := TextsToMatrix([Dataset[i].ExpectedOutput], Model.Config.InputSize);

      // Align the row counts.
      if Length(inputMatrix) <> Length(targetMatrix) then
        AdjustTargetSize(targetMatrix, Length(inputMatrix));

      // Forward pass.
      ForwardTransformer{Optimized}(Model, inputMatrix, outputMatrix, nil, True);

      // Loss and gradient.
      loss := CalculateLoss(outputMatrix, targetMatrix);
      gradOutput := CalculateGradient(outputMatrix, targetMatrix);

      // Gradient clipping by norm.
      gradNorm := MatrixNorm(gradOutput);
      if gradNorm > 1.0 then
      begin
        WriteLn('  Gradient clipping: ', gradNorm:0:4, ' -> 1.0');
        ScaleMatrix(gradOutput, 1.0 / gradNorm);
      end;

      // Backward pass.
      BackwardTransformer(Model, inputMatrix, gradOutput);

      // Weight update with the reduced rate, then clear the gradients so
      // they do not leak into the next example.
      UpdateTransformer(Model, actualLearningRate);
      ResetGradients(Model);

      // The example is complete: account for it.
      totalLoss := totalLoss + loss;
      Inc(validExamples);
      WriteLn('  Loss: ', loss:0:6);

    except
      on E: Exception do
      begin
        WriteLn('  ОШИБКА в примере ', i, ': ', E.Message);
      end;
    end;
  end;

  if validExamples > 0 then
    Result := totalLoss / validExamples
  else
    Result := 0.0;
end;

{ Builds padded input and target matrices for a batch of examples.

  The batch covers up to BatchSize examples starting at StartIndex. Each
  example occupies a fixed slot of maxInputTokens (respectively
  maxTargetTokens) rows, where the maximum is taken over the batch; rows
  are 300 columns wide. Slots are filled by FillExampleMatrix.

  When StartIndex lies outside the dataset or BatchSize is not positive,
  both outputs are returned empty. This used to produce a negative array
  length or an out-of-range index. The unused locals inputTexts and
  targetTexts were removed. }
procedure CreateBatch(const Dataset: TTrainingDataset; StartIndex: Integer; BatchSize: Integer; out Inputs, Targets: TDoubleMatrix);
const
  EmbeddingWidth = 300;
var
  i, actualBatchSize: Integer;
  maxInputTokens, maxTargetTokens: Integer;
begin
  Inputs := nil;
  Targets := nil;

  if StartIndex < 0 then
    Exit;
  actualBatchSize := Min(BatchSize, Length(Dataset) - StartIndex);
  if actualBatchSize <= 0 then
    Exit;

  // Longest input and target in the batch, in tokens.
  maxInputTokens := 0;
  maxTargetTokens := 0;
  for i := 0 to actualBatchSize - 1 do begin
    maxInputTokens := Max(maxInputTokens, CountTokens(Dataset[StartIndex + i].Input));
    maxTargetTokens := Max(maxTargetTokens, CountTokens(Dataset[StartIndex + i].ExpectedOutput));
  end;

  // Fixed-size matrices: one slot per example.
  SetLength(Inputs, actualBatchSize * maxInputTokens, EmbeddingWidth);
  SetLength(Targets, actualBatchSize * maxTargetTokens, EmbeddingWidth);

  // Fill every slot.
  for i := 0 to actualBatchSize - 1 do begin
    FillExampleMatrix(Inputs, i * maxInputTokens, Dataset[StartIndex + i].Input, maxInputTokens, EmbeddingWidth);
    FillExampleMatrix(Targets, i * maxTargetTokens, Dataset[StartIndex + i].ExpectedOutput, maxTargetTokens, EmbeddingWidth);
  end;
end;

{ Returns the number of tokens TokenizeForNLP produces for text. }
function CountTokens(const text: string): Integer;
begin
  Result := Length(TokenizeForNLP(text));
end;

{ Writes the token embeddings of text into a slot of Matrix.

  The slot consists of rows StartRow .. StartRow + maxTokens - 1. Token i
  goes to row StartRow + i; columns beyond the embedding's own length are
  set to 0.0. Slot rows without a token are cleared with FillArray
  (padding). Rows outside Matrix are skipped.

  Changes relative to the previous revision:
  - At most maxTokens embeddings are written. Previously a text producing
    more embeddings than maxTokens spilled into the rows that follow the
    slot, which belong to the next example in a batch.
  - The column loop is limited to the actual width of the destination row,
    so an embeddingSize larger than the row no longer indexes past its end.
  - A negative row index is skipped instead of raising a range error. }
procedure FillExampleMatrix(var Matrix: TDoubleMatrix; StartRow: Integer; const text: string; maxTokens, embeddingSize: Integer);
var
  tokens: TUC4Array;
  embeddings: TDoubleMatrix;
  i, j, rowIdx, usedRows, cols: Integer;
begin
  tokens := TokenizeForNLP(text);
  embeddings := CreateTokenEmbeddings(tokens, embeddingSize);

  // Never write more rows than the slot holds.
  usedRows := Min(Length(embeddings), maxTokens);

  for i := 0 to usedRows - 1 do begin
    rowIdx := StartRow + i;
    if (rowIdx >= 0) and (rowIdx < Length(Matrix)) then
    begin
      cols := Min(embeddingSize, Length(Matrix[rowIdx]));
      for j := 0 to cols - 1 do begin
        if j < Length(embeddings[i]) then
          Matrix[rowIdx][j] := embeddings[i][j]
        else
          Matrix[rowIdx][j] := 0.0;
      end;
    end;
  end;

  // Zero the remaining slot rows (padding).
  for i := usedRows to maxTokens - 1 do begin
    rowIdx := StartRow + i;
    if (rowIdx >= 0) and (rowIdx < Length(Matrix)) then
      FillArray(Matrix[rowIdx], 0.0);
  end;
end;

{ Stores the schedule parameters and starts at step 0; the current rate is
  initialised to InitialLR. }
constructor TLearningRateScheduler.Create(InitialLR: Double; WarmupSteps: Integer; DecaySteps: Integer);
begin
  // Counter first, then the phase lengths, then the rates.
  FStep := 0;
  FWarmupSteps := WarmupSteps;
  FDecaySteps := DecaySteps;
  FInitialLR := InitialLR;
  FCurrentLR := FInitialLR;
end;

{ Returns the learning rate for the given step.

  - Step < FWarmupSteps: linear warmup, FInitialLR * Step / FWarmupSteps.
  - Afterwards: cosine decay from FInitialLR to 0 over FDecaySteps steps,
    FInitialLR * 0.5 * (1 + Cos(Pi * (Step - FWarmupSteps) / FDecaySteps)).
  - Once the decay window is over the rate stays at 0. The unclamped
    cosine is periodic, so the rate used to climb back up after
    FWarmupSteps + FDecaySteps steps.
  - FDecaySteps <= 0 means no decay phase: the rate stays at FInitialLR
    after warmup instead of dividing by zero.
  - A negative Step is treated as step 0. }
function TLearningRateScheduler.GetLearningRate(Step: Integer): Double;
begin
  if Step < 0 then
    Step := 0;

  if Step < FWarmupSteps then
    // Linear warmup.
    Result := FInitialLR * (Step / FWarmupSteps)
  else if FDecaySteps <= 0 then
    // No decay phase configured.
    Result := FInitialLR
  else if Step - FWarmupSteps >= FDecaySteps then
    // Decay finished: hold at the final value of the cosine curve.
    Result := 0.0
  else
    // Cosine decay.
    Result := FInitialLR * 0.5 * (1 + Cos(Pi * (Step - FWarmupSteps) / FDecaySteps));
end;

{ Advances the step counter by one and recomputes the current rate for the
  new step. }
procedure TLearningRateScheduler.Step;
begin
  FStep := FStep + 1;
  FCurrentLR := GetLearningRate(FStep);
end;

// Computes the average loss of the model over a dataset (forward pass only).
//
// For every example with non-empty Input and ExpectedOutput the input matrix
// is built with CreateContextEnrichedInput, the target with TextsToMatrix,
// the model is run with isTraining = False and CalculateLoss is accumulated.
// Examples that yield an empty matrix are reported and skipped; an exception
// inside one example is logged and the loop moves on.
//
// Returns the mean loss over the examples that were evaluated, or MaxDouble
// when the dataset is empty or no example could be evaluated.
function EvaluateModel(const M: TTransformer; const Dataset: TTrainingDataset): Double;
var
  idx, counted: Integer;
  lossSum: Double;
  source, predicted, expected: TDoubleMatrix;
  Model: TTransformer;
begin
  // Work on a local copy of the const argument
  // (presumably because ForwardTransformer takes the model by reference — confirm).
  Model := M;

  if Length(Dataset) = 0 then begin
    WriteLn('ERROR: Empty dataset in EvaluateModel');
    Exit(MaxDouble);
  end;

  lossSum := 0.0;
  counted := 0;

  for idx := 0 to High(Dataset) do begin
    try
      // Examples with an empty input or expected output are silently ignored.
      if (Dataset[idx].Input.Length <> 0) and (Dataset[idx].ExpectedOutput.Length <> 0) then
      begin
        source := CreateContextEnrichedInput(Dataset[idx].Input, Dataset[idx].Context);
        if (Length(source) > 0) and (Length(source[0]) > 0) then
        begin
          expected := TextsToMatrix([Dataset[idx].ExpectedOutput], Model.Config.InputSize);
          if (Length(expected) > 0) and (Length(expected[0]) > 0) then
          begin
            ForwardTransformer(Model, source, predicted, nil, False); // isTraining = False
            lossSum := lossSum + CalculateLoss(predicted, expected);
            Inc(counted);
          end
          else
            WriteLn('Warning: Empty target matrix for example ', idx);
        end
        else
          WriteLn('Warning: Empty input matrix for example ', idx);
      end;
    except
      on E: Exception do
        WriteLn('Evaluation error for example ', idx, ': ', E.Message);
    end;
  end;

  // MaxDouble signals "nothing could be evaluated".
  if counted = 0 then
    Result := MaxDouble
  else
    Result := lossSum / counted;
end;

// Calls LoadModel and converts a failure into a Boolean result.
// Returns True when loading succeeded; on an exception the error is printed
// and False is returned, so the caller can carry on.
function SafeLoadModel(var Model: TTransformer; const Filename: string): Boolean;
begin
  try
    LoadModel(Model, Filename);
    Result := True;
    WriteLn('Модель загружена успешно');
  except
    on E: Exception do
    begin
      Result := False;
      WriteLn('Не удалось загрузить модель: ', E.Message);
      WriteLn('Продолжаем с текущими весами');
    end;
  end;
end;

// Serializes the model into a raw binary file.
//
// File layout (no per-array size headers are written):
//   1. the TTransformerConfig record, written as SizeOf(config) raw bytes;
//   2. every element of Model.Embedding;
//   3. for each layer: Wq, Wk, Wv, Wo of every attention head, then
//      FFN_weights1, FFN_weights2, Norm1_Gamma, Norm1_Beta, Norm2_Gamma,
//      Norm2_Beta.
// Every element is written as one Double. LoadModel reads the same sequence,
// so the two routines must be changed together.
//
// Progress is printed to standard output. On failure the error is printed
// and the exception is re-raised.
//
// The file is now closed in a try..finally block. Previously CloseFile was
// placed after a try..except that re-raises, so any error while writing left
// the file handle open.
//
// NOTE(review): writing the config with BlockWrite assumes TTransformerConfig
// contains no managed fields (strings, dynamic arrays) — confirm.
procedure SaveModel(const Model: TTransformer; const Filename: string);
var
  F: File;
  i, j, k, layer: Integer;
  value: Double;
  config: TTransformerConfig;
begin
  WriteLn('SaveModel: сохранение модели в ', Filename);

  AssignFile(F, Filename);
  try
    Rewrite(F, 1); // Binary mode, record size 1 byte
    try
      // 1. Configuration
      config := Model.Config;
      BlockWrite(F, config, SizeOf(config));

      // 2. Embedding matrix
      WriteLn('  Сохранение embedding матрицы: ', 
              Length(Model.Embedding), 'x', Length(Model.Embedding[0]));
      for i := 0 to High(Model.Embedding) do begin
        for j := 0 to High(Model.Embedding[i]) do begin
          value := Model.Embedding[i][j];
          BlockWrite(F, value, SizeOf(Double));
        end;
      end;

      // 3. Weights of all layers
      WriteLn('  Сохранение ', Length(Model.Layers), ' слоев...');
      for layer := 0 to High(Model.Layers) do begin
        WriteLn('    Слой ', layer, ':');

        // Attention weights
        WriteLn('      Attention heads: ', Length(Model.Layers[layer].SelfAttention.Heads));
        for i := 0 to High(Model.Layers[layer].SelfAttention.Heads) do begin
          // Wq
          WriteLn('        Head ', i, ' Wq: ', 
                  Length(Model.Layers[layer].SelfAttention.Heads[i].Wq), 'x',
                  Length(Model.Layers[layer].SelfAttention.Heads[i].Wq[0]));
          for j := 0 to High(Model.Layers[layer].SelfAttention.Heads[i].Wq) do
            for k := 0 to High(Model.Layers[layer].SelfAttention.Heads[i].Wq[j]) do begin
              value := Model.Layers[layer].SelfAttention.Heads[i].Wq[j][k];
              BlockWrite(F, value, SizeOf(Double));
            end;

          // Wk
          WriteLn('        Head ', i, ' Wk: ', 
                  Length(Model.Layers[layer].SelfAttention.Heads[i].Wk), 'x',
                  Length(Model.Layers[layer].SelfAttention.Heads[i].Wk[0]));
          for j := 0 to High(Model.Layers[layer].SelfAttention.Heads[i].Wk) do
            for k := 0 to High(Model.Layers[layer].SelfAttention.Heads[i].Wk[j]) do begin
              value := Model.Layers[layer].SelfAttention.Heads[i].Wk[j][k];
              BlockWrite(F, value, SizeOf(Double));
            end;

          // Wv
          WriteLn('        Head ', i, ' Wv: ', 
                  Length(Model.Layers[layer].SelfAttention.Heads[i].Wv), 'x',
                  Length(Model.Layers[layer].SelfAttention.Heads[i].Wv[0]));
          for j := 0 to High(Model.Layers[layer].SelfAttention.Heads[i].Wv) do
            for k := 0 to High(Model.Layers[layer].SelfAttention.Heads[i].Wv[j]) do begin
              value := Model.Layers[layer].SelfAttention.Heads[i].Wv[j][k];
              BlockWrite(F, value, SizeOf(Double));
            end;

          // Wo
          WriteLn('        Head ', i, ' Wo: ', 
                  Length(Model.Layers[layer].SelfAttention.Heads[i].Wo), 'x',
                  Length(Model.Layers[layer].SelfAttention.Heads[i].Wo[0]));
          for j := 0 to High(Model.Layers[layer].SelfAttention.Heads[i].Wo) do
            for k := 0 to High(Model.Layers[layer].SelfAttention.Heads[i].Wo[j]) do begin
              value := Model.Layers[layer].SelfAttention.Heads[i].Wo[j][k];
              BlockWrite(F, value, SizeOf(Double));
            end;
        end;

        // FFN weights
        WriteLn('      FFN1: ', 
                Length(Model.Layers[layer].FFN_weights1), 'x',
                Length(Model.Layers[layer].FFN_weights1[0]));
        for i := 0 to High(Model.Layers[layer].FFN_weights1) do
          for j := 0 to High(Model.Layers[layer].FFN_weights1[i]) do begin
            value := Model.Layers[layer].FFN_weights1[i][j];
            BlockWrite(F, value, SizeOf(Double));
          end;

        WriteLn('      FFN2: ', 
                Length(Model.Layers[layer].FFN_weights2), 'x',
                Length(Model.Layers[layer].FFN_weights2[0]));
        for i := 0 to High(Model.Layers[layer].FFN_weights2) do
          for j := 0 to High(Model.Layers[layer].FFN_weights2[i]) do begin
            value := Model.Layers[layer].FFN_weights2[i][j];
            BlockWrite(F, value, SizeOf(Double));
          end;

        // LayerNorm parameters
        WriteLn('      Norm1_Gamma: ', Length(Model.Layers[layer].Norm1_Gamma));
        for i := 0 to High(Model.Layers[layer].Norm1_Gamma) do begin
          value := Model.Layers[layer].Norm1_Gamma[i];
          BlockWrite(F, value, SizeOf(Double));
        end;

        WriteLn('      Norm1_Beta: ', Length(Model.Layers[layer].Norm1_Beta));
        for i := 0 to High(Model.Layers[layer].Norm1_Beta) do begin
          value := Model.Layers[layer].Norm1_Beta[i];
          BlockWrite(F, value, SizeOf(Double));
        end;

        WriteLn('      Norm2_Gamma: ', Length(Model.Layers[layer].Norm2_Gamma));
        for i := 0 to High(Model.Layers[layer].Norm2_Gamma) do begin
          value := Model.Layers[layer].Norm2_Gamma[i];
          BlockWrite(F, value, SizeOf(Double));
        end;

        WriteLn('      Norm2_Beta: ', Length(Model.Layers[layer].Norm2_Beta));
        for i := 0 to High(Model.Layers[layer].Norm2_Beta) do begin
          value := Model.Layers[layer].Norm2_Beta[i];
          BlockWrite(F, value, SizeOf(Double));
        end;
      end;
    finally
      // Runs on success and on error, but only once Rewrite has opened the file.
      CloseFile(F);
    end;

    WriteLn('Модель успешно сохранена: ', Filename);

  except
    on E: Exception do
    begin
      WriteLn('ОШИБКА сохранения модели: ', E.Message);
      raise;
    end;
  end;
end;

// Loads model weights from a binary file written by SaveModel.
//
// The stored TTransformerConfig is read first and compared with
// Model.Config (InputSize, NumLayers, NumHeads); a mismatch only produces a
// warning. The weights are then read, one Double per element, into the
// arrays that Model already has allocated, in the order: Embedding, then for
// each layer Wq/Wk/Wv/Wo of every head, FFN_weights1, FFN_weights2,
// Norm1_Gamma, Norm1_Beta, Norm2_Gamma, Norm2_Beta. The array sizes come
// from the current Model, not from the file.
//
// Raises Exception when the file does not exist. On any error while reading,
// the error is printed, the model is re-initialised with
// InitTransformer(Model, Model.Config) and the exception is re-raised.
//
// The file is now closed in a try..finally block. Previously CloseFile was
// placed after a try..except that re-raises, so a read error left the file
// handle open.
procedure LoadModel(var Model: TTransformer; const Filename: string);
var
  F: File;
  i, j, k, layer: Integer;
  value: Double;
  config: TTransformerConfig;
begin
  if not FileExists(Filename) then
    raise Exception.Create('Файл модели не найден: ' + Filename);
  WriteLn('LoadModel: загрузка модели из ', Filename);
  AssignFile(F, Filename);
  try
    Reset(F, 1); // Binary mode, record size 1 byte
    try
      // 1. Configuration
      BlockRead(F, config, SizeOf(config));
      // Compatibility check: report only, loading continues regardless.
      if (config.InputSize <> Model.Config.InputSize) or
         (config.NumLayers <> Model.Config.NumLayers) or
         (config.NumHeads <> Model.Config.NumHeads) then
      begin
        WriteLn('Предупреждение: Несовместимая конфигурация модели');
        WriteLn('  Файл: InputSize=', config.InputSize, 
                ', NumLayers=', config.NumLayers, 
                ', NumHeads=', config.NumHeads);
        WriteLn('  Текущая: InputSize=', Model.Config.InputSize,
                ', NumLayers=', Model.Config.NumLayers,
                ', NumHeads=', Model.Config.NumHeads);
      end;

      // 2. Embedding matrix
      WriteLn('  Загрузка embedding матрицы...');
      for i := 0 to High(Model.Embedding) do begin
        for j := 0 to High(Model.Embedding[i]) do begin
          BlockRead(F, value, SizeOf(Double));
          Model.Embedding[i][j] := value;
        end;
      end;

      // 3. Weights of all layers
      WriteLn('  Загрузка ', Length(Model.Layers), ' слоев...');
      for layer := 0 to High(Model.Layers) do
      begin
        WriteLn('    Слой ', layer, ':');
        // Attention weights
        WriteLn('      Attention heads...');
        for i := 0 to High(Model.Layers[layer].SelfAttention.Heads) do begin
          // Wq
          for j := 0 to High(Model.Layers[layer].SelfAttention.Heads[i].Wq) do
            for k := 0 to High(Model.Layers[layer].SelfAttention.Heads[i].Wq[j]) do begin
              BlockRead(F, value, SizeOf(Double));
              Model.Layers[layer].SelfAttention.Heads[i].Wq[j][k] := value;
            end;

          // Wk
          for j := 0 to High(Model.Layers[layer].SelfAttention.Heads[i].Wk) do
            for k := 0 to High(Model.Layers[layer].SelfAttention.Heads[i].Wk[j]) do begin
              BlockRead(F, value, SizeOf(Double));
              Model.Layers[layer].SelfAttention.Heads[i].Wk[j][k] := value;
            end;

          // Wv
          for j := 0 to High(Model.Layers[layer].SelfAttention.Heads[i].Wv) do
            for k := 0 to High(Model.Layers[layer].SelfAttention.Heads[i].Wv[j]) do begin
              BlockRead(F, value, SizeOf(Double));
              Model.Layers[layer].SelfAttention.Heads[i].Wv[j][k] := value;
            end;

          // Wo
          for j := 0 to High(Model.Layers[layer].SelfAttention.Heads[i].Wo) do
            for k := 0 to High(Model.Layers[layer].SelfAttention.Heads[i].Wo[j]) do begin
              BlockRead(F, value, SizeOf(Double));
              Model.Layers[layer].SelfAttention.Heads[i].Wo[j][k] := value;
            end;
        end;

        // FFN weights
        WriteLn('      FFN weights...');
        for i := 0 to High(Model.Layers[layer].FFN_weights1) do
          for j := 0 to High(Model.Layers[layer].FFN_weights1[i]) do begin
            BlockRead(F, value, SizeOf(Double));
            Model.Layers[layer].FFN_weights1[i][j] := value;
          end;

        for i := 0 to High(Model.Layers[layer].FFN_weights2) do
          for j := 0 to High(Model.Layers[layer].FFN_weights2[i]) do begin
            BlockRead(F, value, SizeOf(Double));
            Model.Layers[layer].FFN_weights2[i][j] := value;
          end;

        // LayerNorm parameters
        WriteLn('      LayerNorm parameters...');
        for i := 0 to High(Model.Layers[layer].Norm1_Gamma) do begin
          BlockRead(F, value, SizeOf(Double));
          Model.Layers[layer].Norm1_Gamma[i] := value;
        end;

        for i := 0 to High(Model.Layers[layer].Norm1_Beta) do begin
          BlockRead(F, value, SizeOf(Double));
          Model.Layers[layer].Norm1_Beta[i] := value;
        end;

        for i := 0 to High(Model.Layers[layer].Norm2_Gamma) do begin
          BlockRead(F, value, SizeOf(Double));
          Model.Layers[layer].Norm2_Gamma[i] := value;
        end;

        for i := 0 to High(Model.Layers[layer].Norm2_Beta) do begin
          BlockRead(F, value, SizeOf(Double));
          Model.Layers[layer].Norm2_Beta[i] := value;
        end;
      end;
    finally
      // Runs on success and on error, but only once Reset has opened the file.
      CloseFile(F);
    end;

    WriteLn('Модель успешно загружена: ', Filename);

  except
    on E: Exception do
    begin
      WriteLn('ОШИБКА загрузки модели: ', E.Message);
      // A partial read may have overwritten some weights: start from a fresh model.
      InitTransformer(Model, Model.Config);
      raise;
    end;
  end;
end;

// Builds input and target matrices for every example of the batch, merges
// them with CombineExampleSequences and runs a single forward pass.
// This routine is unfinished: the merged targets are computed but not used,
// and no backward pass or weight update follows the forward pass.
procedure ProcessBatch(var Model: TTransformer; const Batch: TTrainingDataset);
var
  n: Integer;
  mergedInputs, mergedTargets, predictions: TDoubleMatrix;
  exampleInputs, exampleTargets: array of TDoubleMatrix;
begin
  SetLength(exampleInputs, Length(Batch));
  SetLength(exampleTargets, Length(Batch));

  // Per-example preparation.
  for n := Low(Batch) to High(Batch) do
  begin
    exampleInputs[n] := CreateContextEnrichedInput(Batch[n].Input, Batch[n].Context);
    exampleTargets[n] := TextsToMatrix([Batch[n].ExpectedOutput], Model.Config.InputSize);
  end;

  // Merge everything into one large batch.
  mergedInputs := CombineExampleSequences(exampleInputs);
  mergedTargets := CombineExampleSequences(exampleTargets);

  // One forward pass for the whole batch.
  ForwardTransformer(Model, mergedInputs, predictions);
  // ... backward and update are not implemented here
end;

// Performs one training step on a batch and reports its loss.
//
// Steps: build input/target matrices per example (the target is passed to
// AdjustTargetSize when its row count differs from the input's), stack them
// with CombineMatricesVertically, run ForwardTransformer in training mode,
// compute the mean squared error and its gradient with respect to the
// output, then call BackwardTransformer and UpdateTransformer.
//
// BatchLoss receives the mean squared error. It is 0.0 for an empty batch
// (after printing an error) and MaxDouble when any exception occurred; the
// exception itself is logged and not propagated.
procedure ProcessTrainingBatch(var Model: TTransformer; const Batch: TTrainingDataset; LearningRate: Double; out BatchLoss: Double);
var
  n, row, col: Integer;
  delta: Double;
  stackedInputs, stackedTargets, predictions, lossGrad: TDoubleMatrix;
  exampleInputs, exampleTargets: array of TDoubleMatrix;
begin
  BatchLoss := 0.0;

  if Length(Batch) = 0 then
  begin
    WriteLn('Ошибка: пустой пакет');
    Exit;
  end;

  try
    // 1. Per-example matrices
    SetLength(exampleInputs, Length(Batch));
    SetLength(exampleTargets, Length(Batch));

    for n := 0 to High(Batch) do
    begin
      exampleInputs[n] := CreateContextEnrichedInput(Batch[n].Input, Batch[n].Context);
      exampleTargets[n] := TextsToMatrix([Batch[n].ExpectedOutput], Model.Config.InputSize);

      if Length(exampleTargets[n]) <> Length(exampleInputs[n]) then
        AdjustTargetSize(exampleTargets[n], Length(exampleInputs[n]));
    end;

    // 2. Stack into one batch
    stackedInputs := CombineMatricesVertically(exampleInputs);
    stackedTargets := CombineMatricesVertically(exampleTargets);

    // 3. Forward pass (training mode)
    ForwardTransformer(Model, stackedInputs, predictions, nil, True);

    // 4 + 5. Squared-error sum and MSE gradient in a single sweep;
    // both use the same element-wise difference.
    SetLength(lossGrad, Length(predictions), Length(predictions[0]));
    for row := 0 to High(predictions) do
      for col := 0 to High(predictions[0]) do
      begin
        delta := predictions[row][col] - stackedTargets[row][col];
        BatchLoss := BatchLoss + Sqr(delta);
        lossGrad[row][col] := 2.0 * delta / (Length(predictions) * Length(predictions[0]));
      end;

    BatchLoss := BatchLoss / (Length(predictions) * Length(predictions[0]));

    // 6. Backward pass and weight update
    BackwardTransformer(Model, stackedInputs, lossGrad);
    UpdateTransformer(Model, LearningRate);

  except
    on E: Exception do
    begin
      WriteLn('Ошибка обработки пакета: ', E.Message);
      BatchLoss := MaxDouble;
    end;
  end;
end;

// Stacks the given matrices on top of each other, in order, and returns the
// result as a new matrix.
//
// Matrices with zero rows contribute nothing and are skipped. Previously the
// first row of every matrix was read unconditionally, so a single empty
// matrix raised a range-check error. The column count is taken from the
// first row of the first non-empty matrix.
//
// Returns an empty matrix when no matrix is given or all of them are empty.
// Raises Exception when the first rows of two non-empty matrices differ in
// length.
function CombineMatricesVertically(const Matrices: array of TDoubleMatrix): TDoubleMatrix;
var
  i, j, k, totalRows, cols, currentRow: Integer;
begin
  // One pass: determine the column count, check it and total the rows.
  cols := -1; // -1 = not known yet
  totalRows := 0;
  for i := 0 to High(Matrices) do begin
    if Length(Matrices[i]) = 0 then
      Continue;
    if cols < 0 then
      cols := Length(Matrices[i][0])
    else if Length(Matrices[i][0]) <> cols then
      raise Exception.Create('Несовпадение количества столбцов при объединении матриц');
    Inc(totalRows, Length(Matrices[i]));
  end;

  if totalRows = 0 then begin
    SetLength(Result, 0, 0);
    Exit;
  end;

  // Allocate the result and copy row by row.
  SetLength(Result, totalRows, cols);
  currentRow := 0;
  for i := 0 to High(Matrices) do begin
    for j := 0 to High(Matrices[i]) do begin
      for k := 0 to cols - 1 do
        Result[currentRow][k] := Matrices[i][j][k];
      Inc(currentRow);
    end;
  end;
end;

// Splits Dataset into consecutive batches of BatchSize examples; the last
// batch holds whatever is left and may be smaller.
// BatchSize is used as a divisor and is not validated here.
function CreateMiniBatches(const Dataset: TTrainingDataset; BatchSize: Integer): TTrainingDatasetArray;
var
  batchIdx, itemIdx, batchCount, offset: Integer;
begin
  // Ceiling division: a partial last batch counts as a batch.
  batchCount := (Length(Dataset) + BatchSize - 1) div BatchSize;
  SetLength(Result, batchCount);

  for batchIdx := 0 to batchCount - 1 do
  begin
    offset := batchIdx * BatchSize;
    SetLength(Result[batchIdx], Min(BatchSize, Length(Dataset) - offset));
    for itemIdx := 0 to High(Result[batchIdx]) do
      Result[batchIdx][itemIdx] := Dataset[offset + itemIdx];
  end;
end;

// Picks a batch size from a rough memory estimate.
//
// Parameters:
//   AvailableMemory - memory budget in bytes.
//   ModelSize       - multiplied by 300 * 8 * 3 to estimate the bytes needed
//                     per example (300 columns, 8-byte doubles, 3 buffers).
//
// Returns AvailableMemory div estimate clamped to
// [MIN_BATCH_SIZE, MAX_BATCH_SIZE], or DEFAULT_BATCH_SIZE when the estimate
// is not positive (ModelSize <= 0). The choice and the figures behind it are
// printed to standard output.
//
// The estimate is held in an Int64. ModelSize * 7200 exceeds the 32-bit
// Integer range for ModelSize above roughly 298000, which previously raised
// a range-check error (or wrapped when range checks are off).
function CalculateOptimalBatchSize(AvailableMemory: Integer; ModelSize: Integer): Integer;
const
  // Conservative defaults
  MIN_BATCH_SIZE = 2;
  MAX_BATCH_SIZE = 16;
  DEFAULT_BATCH_SIZE = 4;
var
  estimatedMemoryPerExample: Int64;
  maxPossibleBatchSize: Integer;
begin
  // Quick estimate of memory per example:
  // input_size * embedding_size * 8 bytes (double) * 3 (buffers)
  estimatedMemoryPerExample := Int64(ModelSize) * 300 * 8 * 3;

  if estimatedMemoryPerExample <= 0 then begin
    WriteLn('Ошибка оценки памяти, используем размер пакета по умолчанию: ', DEFAULT_BATCH_SIZE);
    Exit(DEFAULT_BATCH_SIZE);
  end;

  // The divisor is at least 1 here, so the quotient always fits in an Integer.
  maxPossibleBatchSize := Integer(AvailableMemory div estimatedMemoryPerExample);

  // Clamp to sensible limits
  Result := Min(MAX_BATCH_SIZE, Max(MIN_BATCH_SIZE, maxPossibleBatchSize));

  WriteLn('Оптимальный размер пакета: ', Result);
  WriteLn('  Доступно памяти: ', AvailableMemory div (1024*1024), ' MB');
  WriteLn('  Память на пример: ~', estimatedMemoryPerExample div 1024, ' KB');
  WriteLn('  Теоретически возможно: ', maxPossibleBatchSize, ' примеров');
end;

// Runs forward and backward passes on the Step-th slice of Batch.
//
// Batch is cut into TotalSteps slices of Length(Batch) div TotalSteps
// examples; Step is 1-based. The slice is handed to the ProcessTrainingBatch
// overload that returns the matrices; that overload does not update the
// weights, and the learning rate passed here is 0.0.
//
// The last slice (Step = TotalSteps) now extends to the end of Batch.
// Previously the Length(Batch) mod TotalSteps trailing examples were never
// part of any slice in 1..TotalSteps.
//
// Raises Exception when TotalSteps is not positive (previously a division
// by zero for TotalSteps = 0).
procedure ProcessMiniBatch(var Model: TTransformer; const Batch: TTrainingDataset; Step, TotalSteps: Integer; out inputs, targets, outputs, gradOutput: TDoubleMatrix);
var
  i, startIdx, endIdx, batchSize: Integer;
  miniBatch: TTrainingDataset;
begin
  if TotalSteps <= 0 then
    raise Exception.Create('ProcessMiniBatch: TotalSteps должен быть больше нуля');

  // Range of the current mini-batch
  batchSize := Length(Batch) div TotalSteps;
  startIdx := (Step - 1) * batchSize;
  if Step = TotalSteps then
    // The final slice also takes the remainder of the integer division.
    endIdx := High(Batch)
  else
    endIdx := Min(startIdx + batchSize - 1, High(Batch));

  SetLength(miniBatch, endIdx - startIdx + 1);
  for i := startIdx to endIdx do
    miniBatch[i - startIdx] := Batch[i];

  // Process the mini-batch
  ProcessTrainingBatch(Model, miniBatch, 0.0, inputs, targets, outputs, gradOutput);
end;

// Упрощенная версия ProcessTrainingBatch без обновления весов
// Forward + backward pass over a batch, without a weight update.
//
// Builds and stacks the per-example input/target matrices, runs
// ForwardTransformer in training mode, computes the gradient of the mean
// squared error with respect to the outputs and calls BackwardTransformer.
// UpdateTransformer is not called and LearningRate is not used.
//
// Out parameters receive the stacked inputs, stacked targets, model outputs
// and the output gradient. Exceptions are not handled here.
procedure ProcessTrainingBatch(var Model: TTransformer; const Batch: TTrainingDataset; LearningRate: Double; out inputs, targets, outputs, gradOutput: TDoubleMatrix);
var
  n, row, col: Integer;
  cellCount: Int64;
  exampleInputs, exampleTargets: array of TDoubleMatrix;
begin
  // Per-example matrices
  SetLength(exampleInputs, Length(Batch));
  SetLength(exampleTargets, Length(Batch));

  for n := 0 to High(Batch) do
  begin
    exampleInputs[n] := CreateContextEnrichedInput(Batch[n].Input, Batch[n].Context);
    exampleTargets[n] := TextsToMatrix([Batch[n].ExpectedOutput], Model.Config.InputSize);

    if Length(exampleTargets[n]) <> Length(exampleInputs[n]) then
      AdjustTargetSize(exampleTargets[n], Length(exampleInputs[n]));
  end;

  // Stack into one batch
  inputs := CombineMatricesVertically(exampleInputs);
  targets := CombineMatricesVertically(exampleTargets);

  // Forward pass
  ForwardTransformer(Model, inputs, outputs, nil, True);

  // MSE gradient with respect to the outputs (weights are NOT updated here)
  SetLength(gradOutput, Length(outputs), Length(outputs[0]));
  cellCount := Int64(Length(outputs)) * Length(outputs[0]);
  for row := 0 to High(outputs) do
    for col := 0 to High(outputs[0]) do
      gradOutput[row][col] := 2.0 * (outputs[row][col] - targets[row][col]) / cellCount;

  // Backward pass: gradients are computed but not applied
  BackwardTransformer(Model, inputs, gradOutput);
end;

// TrainerUnit.pas - упрощенная версия без сложного накопления
// Simplified "gradient accumulation": when GradientAccumulationSteps > 1 the
// learning rate handed on is divided by that number (and a message is
// printed); otherwise it is passed through unchanged. The batch is then run
// through the ProcessTrainingBatch overload that returns the matrices, and
// BatchLoss is set to the mean squared error between outputs and targets.
//
// Note: that overload in this unit does forward and backward passes only and
// does not use its LearningRate argument.
procedure ProcessTrainingBatchWithAccumulation(var Model: TTransformer; const Batch: TTrainingDataset; LearningRate: Double; GradientAccumulationSteps: Integer; out BatchLoss: Double);
var
  row, col: Integer;
  effectiveLR: Double;
  inputs, targets, outputs, gradOutput: TDoubleMatrix;
begin
  effectiveLR := LearningRate;
  if GradientAccumulationSteps > 1 then
  begin
    WriteLn('Упрощенное накопление градиентов: делим LR на ', GradientAccumulationSteps);
    effectiveLR := LearningRate / GradientAccumulationSteps;
  end;

  ProcessTrainingBatch(Model, Batch, effectiveLR, inputs, targets, outputs, gradOutput);

  // Mean squared error over all output elements
  BatchLoss := 0.0;
  for row := 0 to High(outputs) do
    for col := 0 to High(outputs[0]) do
      BatchLoss := BatchLoss + Sqr(outputs[row][col] - targets[row][col]);

  BatchLoss := BatchLoss / (Length(outputs) * Length(outputs[0]));
end;

// Trains the model for one epoch, BatchSize examples at a time.
//
// The dataset is cut into consecutive batches (the last one may be smaller)
// and each batch is passed to ProcessTrainingBatch. A batch whose loss comes
// back as MaxDouble, or that raises an exception, is reported and skipped.
//
// Returns the mean loss over the batches that were processed, or MaxDouble
// when the dataset is empty, BatchSize is not positive, or no batch could be
// processed.
//
// The BatchSize check is new: BatchSize = 0 used to raise a division by zero
// outside the try block, unlike every other failure of this function.
function TrainEpochWithBatches(var Model: TTransformer; const Dataset: TTrainingDataset; LearningRate: Double; BatchSize: Integer): Double;
var
  i, j, numBatches, currentIndex: Integer;
  batch: TTrainingDataset;
  batchLoss, totalLoss: Double;
  validBatches: Integer;
begin
  WriteLn('Обучение эпохи с пакетами (BatchSize=', BatchSize, ')');

  if Length(Dataset) = 0 then begin
    WriteLn('Ошибка: пустой датасет');
    Exit(MaxDouble);
  end;

  if BatchSize <= 0 then begin
    WriteLn('Ошибка: некорректный размер пакета');
    Exit(MaxDouble);
  end;

  // Ceiling division: a partial last batch counts as a batch.
  numBatches := (Length(Dataset) + BatchSize - 1) div BatchSize;
  totalLoss := 0.0;
  validBatches := 0;

  WriteLn('Всего примеров: ', Length(Dataset), ', пакетов: ', numBatches);

  for i := 0 to numBatches - 1 do begin
    try
      // Build the current batch
      currentIndex := i * BatchSize;
      SetLength(batch, Min(BatchSize, Length(Dataset) - currentIndex));

      for j := 0 to High(batch) do
        batch[j] := Dataset[currentIndex + j];

      WriteLn('Пакет ', i + 1, '/', numBatches, ' (', Length(batch), ' примеров)');

      // Train on it
      ProcessTrainingBatch(Model, batch, LearningRate, batchLoss);

      // ProcessTrainingBatch reports a failed batch as MaxDouble.
      if batchLoss < MaxDouble then
      begin
        totalLoss := totalLoss + batchLoss;
        Inc(validBatches);
        WriteLn('  Потери пакета: ', batchLoss:0:6);
      end
      else
      begin
        WriteLn('  Пропускаем пакет из-за ошибки');
      end;

    except
      on E: Exception do
      begin
        WriteLn('  ОШИБКА в пакете ', i + 1, ': ', E.Message);
        // Carry on with the next batch
      end;
    end;
  end;

  if validBatches > 0 then
  begin
    Result := totalLoss / validBatches;
    WriteLn('Средние потери эпохи: ', Result:0:6);
  end
  else
  begin
    Result := MaxDouble;
    WriteLn('Не удалось обработать ни одного пакета');
  end;
end;

// Builds the model input matrix from an input text and an optional context.
//
// With a non-blank context the text is
//   Copy(trimmed context, 1, 150) + ' | ' + Copy(trimmed input, 1, 150),
// otherwise Copy(trimmed input, 1, 200). The text is converted with
// FastTextToMatrixIndices when WordEmbeddings is assigned, and with
// TextsToMatrix otherwise (300 columns in both cases). The resulting
// dimensions are printed.
//
// The dimension log no longer reads Result[0] when the matrix has no rows.
// That access raised a range-check error, so an empty result never reached
// the caller's own empty-matrix check.
//
// NOTE(review): Copy counts bytes, not characters; if these strings are
// UTF-8 the cut may fall inside a multi-byte character — confirm.
function FastCreateContextEnrichedInput(const InputText, Context: string): TDoubleMatrix;
var
  enrichedText, trimmedContext: string;
  colCount: Integer;
begin
  // Quick combination, no further processing
  trimmedContext := Context.Trim;
  if trimmedContext <> '' then
    enrichedText := Copy(trimmedContext, 1, 150) + ' | ' + Copy(InputText.Trim, 1, 150)
  else
    enrichedText := Copy(InputText.Trim, 1, 200);

  // Prefer the index-based conversion when embeddings are loaded
  if Assigned(WordEmbeddings) then begin
    Result := FastTextToMatrixIndices(enrichedText, WordEmbeddings, 300);
  end else begin
    // Fallback to the standard method
    Result := TextsToMatrix([enrichedText], 300);
  end;

  if Length(Result) > 0 then
    colCount := Length(Result[0])
  else
    colCount := 0;
  WriteLn('FastCreateInput: ', Length(Result), 'x', colCount);
end;

// Builds the target matrix for an expected output text (300 columns).
// When WordEmbeddings is assigned, the first 200 positions of the trimmed
// text go through FastTextToMatrixIndices; otherwise the text is passed to
// TextsToMatrix as is, neither trimmed nor truncated.
function FastCreateTargetMatrix(const ExpectedOutput: string): TDoubleMatrix;
begin
  if not Assigned(WordEmbeddings) then
    Exit(TextsToMatrix([ExpectedOutput], 300));

  // Index-based conversion of the target
  Result := FastTextToMatrixIndices(Copy(ExpectedOutput.Trim, 1, 200), WordEmbeddings, 300);
end;

// В TrainerUnit.pas - быстрая версия с индексированием
{
procedure TrainTransformerWithIndexing(var Model: TTransformer; 
                                     const Dataset: TTrainingDataset; 
                                     Epochs: Integer; 
                                     LearningRate: Double);
var
  epoch, i: Integer;
  inputMatrix, targetMatrix, outputMatrix, gradOutput: TDoubleMatrix;
  loss, totalLoss: Double;
  validExamples: Integer;
  startTime: TDateTime;
begin
  WriteLn('🚀 ОБУЧЕНИЕ С ИНДЕКСАЦИЕЙ АКТИВИРОВАНО');
  WriteLn('Примеров: ', Length(Dataset), ', Эпох: ', Epochs);

  startTime := Now;

  for epoch := 1 to Epochs do begin
    totalLoss := 0;
    validExamples := 0;

    for i := 0 to High(Dataset) do begin
      try
        // 🔥 БЫСТРЫЙ ПРЕПРОЦЕССИНГ ЧЕРЕЗ ИНДЕКСЫ
        inputMatrix := FastCreateContextEnrichedInput(Dataset[i].Input, Dataset[i].Context);
        targetMatrix := FastCreateTargetMatrix(Dataset[i].ExpectedOutput);

        // Проверяем размерности
        if (Length(inputMatrix) = 0) or (Length(targetMatrix) = 0) then begin
          WriteLn('  Пропуск: пустая матрица');
          Continue;
        end;

        // Выравниваем размеры если нужно
        if Length(inputMatrix) <> Length(targetMatrix) then
          AdjustTargetSize(targetMatrix, Length(inputMatrix));

        // Стандартный прямой проход (оставляем как есть)
        ForwardTransformer(Model, inputMatrix, outputMatrix);

        // Вычисление потерь
        loss := CalculateLoss(outputMatrix, targetMatrix);
        totalLoss := totalLoss + loss;

        // Стандартный backward (оставляем как есть)
        gradOutput := CalculateGradient(outputMatrix, targetMatrix);
        BackwardTransformer(Model, inputMatrix, gradOutput);

        // Обновление весов
        UpdateTransformer(Model, LearningRate);

        Inc(validExamples);

        // Прогресс
        if (i > 0) and (i mod 5 = 0) then begin
          WriteLn('  ', i, '/', Length(Dataset), 
                  ' Loss: ', (totalLoss/validExamples):0:4,
                  ' Time: ', FormatDateTime('nn:ss', Now - startTime));
        end;

      except
        on E: Exception do begin
          WriteLn('  Пропуск примера ', i, ': ', E.Message);
        end;
      end;
    end;

    if validExamples > 0 then begin
      WriteLn('✅ Эпоха ', epoch, '/', Epochs, 
              ' | Loss: ', (totalLoss/validExamples):0:4,
              ' | Примеров: ', validExamples,
              ' | Время: ', FormatDateTime('nn:ss', Now - startTime));
    end;
  end;

  WriteLn('🎯 ОБУЧЕНИЕ ЗАВЕРШЕНО за ', FormatDateTime('nn:ss', Now - startTime));
end;
}
// Trains the model example by example for the given number of epochs, using
// the index-based helpers FastCreateContextEnrichedInput and
// FastCreateTargetMatrix to build the matrices.
//
// Per example: skip when Input or ExpectedOutput is empty or a matrix has no
// rows; pass the target to AdjustTargetSize when its row count differs from
// the input's; forward pass; CalculateLoss; CalculateGradient; scale the
// gradient by 1/norm when MatrixNorm exceeds 1.0; BackwardTransformer;
// UpdateTransformer. An exception inside one example is printed and the loop
// moves on.
//
// Progress is printed every 10 examples and a summary after each epoch.
//
// The loss of an example is now added to the epoch total only after the
// backward pass and the update succeeded, together with the example counter.
// Previously it was added right after the forward pass, so an example that
// failed later inflated the reported average.
procedure TrainTransformerWithIndexing(var Model: TTransformer; 
                                     const Dataset: TTrainingDataset; 
                                     Epochs: Integer; 
                                     LearningRate: Double);
var
  epoch, i: Integer;
  inputMatrix, targetMatrix, outputMatrix, gradOutput: TDoubleMatrix;
  loss, totalLoss, avgLoss, gradNorm: Double;
  validExamples: Integer;
  startTime: TDateTime;
begin
  WriteLn('🚀 ЗАПУСК ОБУЧЕНИЯ ТРАНСФОРМЕРА');
  WriteLn('Примеров: ', Length(Dataset), ', Эпох: ', Epochs, ', Learning Rate: ', LearningRate:0:6);

  startTime := Now;

  for epoch := 1 to Epochs do begin
    totalLoss := 0;
    validExamples := 0;

    WriteLn('Эпоха ', epoch, '/', Epochs, ':');

    for i := 0 to High(Dataset) do begin
      try
        // Skip empty examples
        if (Dataset[i].Input = '') or (Dataset[i].ExpectedOutput = '') then
          Continue;

        // Index-based preprocessing
        inputMatrix := FastCreateContextEnrichedInput(Dataset[i].Input, Dataset[i].Context);
        targetMatrix := FastCreateTargetMatrix(Dataset[i].ExpectedOutput);

        // Check dimensions
        if (Length(inputMatrix) = 0) or (Length(targetMatrix) = 0) then begin
          WriteLn('  Пропуск примера ', i, ': пустая матрица');
          Continue;
        end;

        // Align the target with the input if needed
        if Length(inputMatrix) <> Length(targetMatrix) then
          AdjustTargetSize(targetMatrix, Length(inputMatrix));

        // Forward pass
        ForwardTransformer(Model, inputMatrix, outputMatrix);

        // Loss and its gradient with respect to the output
        loss := CalculateLoss(outputMatrix, targetMatrix);
        gradOutput := CalculateGradient(outputMatrix, targetMatrix);

        // Gradient clipping for stability: rescale to unit norm when larger
        gradNorm := MatrixNorm(gradOutput);
        if gradNorm > 1.0 then
        begin
          ScaleMatrix(gradOutput, 1.0 / gradNorm);
        end;

        // Backward pass and weight update
        BackwardTransformer(Model, inputMatrix, gradOutput);
        UpdateTransformer(Model, LearningRate);

        // Count the example only once the whole step has succeeded, so that
        // totalLoss and validExamples always describe the same examples.
        totalLoss := totalLoss + loss;
        Inc(validExamples);

        // Progress every 10 examples
        if (i > 0) and (i mod 10 = 0) then begin
          avgLoss := totalLoss / validExamples;
          WriteLn('  ', i, '/', Length(Dataset), 
                  ' Loss: ', avgLoss:0:4,
                  ' Time: ', FormatDateTime('nn:ss', Now - startTime));
        end;

      except
        on E: Exception do begin
          WriteLn('  Ошибка в примере ', i, ': ', E.Message);
        end;
      end;
    end;

    if validExamples > 0 then begin
      avgLoss := totalLoss / validExamples;
      WriteLn('✅ Эпоха ', epoch, ' завершена.');
      WriteLn('   Средние потери: ', avgLoss:0:4);
      WriteLn('   Обработано примеров: ', validExamples);
      WriteLn('   Время: ', FormatDateTime('nn:ss', Now - startTime));
    end else begin
      WriteLn('❌ Эпоха ', epoch, ': нет валидных примеров');
    end;
  end;

  WriteLn('🎯 ОБУЧЕНИЕ ЗАВЕРШЕНО за ', FormatDateTime('nn:ss', Now - startTime));
end;

// Model inspection helper: prints the configuration and the layer dimensions.
// Prints the model configuration and the dimensions of the embedding matrix
// and of each layer's FFN weight matrices. Nothing is checked or modified.
//
// The column counts are now computed with plain if/else. The previous
// IfThen(...) calls evaluated both arguments, so the [0] access ran even for
// an empty matrix and raised a range-check error that the guard was meant to
// prevent.
procedure ValidateModel(var Model: TTransformer);
var
  i, colCount: Integer;
begin
  WriteLn('=== ПРОВЕРКА МОДЕЛИ ===');
  WriteLn('Config:');
  WriteLn('  InputSize: ', Model.Config.InputSize);
  WriteLn('  NumLayers: ', Model.Config.NumLayers);
  WriteLn('  NumHeads: ', Model.Config.NumHeads);
  WriteLn('  FFNDim: ', Model.Config.FFNDim);
  WriteLn('  MaxSeqLength: ', Model.Config.MaxSeqLength);
  WriteLn('  DropoutRate: ', Model.Config.DropoutRate:0:4);

  if Length(Model.Embedding) > 0 then
    colCount := Length(Model.Embedding[0])
  else
    colCount := 0;
  WriteLn('Embedding: ', Length(Model.Embedding), 'x', colCount);

  WriteLn('Layers: ', Length(Model.Layers));
  for i := 0 to High(Model.Layers) do
  begin
    WriteLn('  Слой ', i, ':');
    WriteLn('    SelfAttention heads: ', Length(Model.Layers[i].SelfAttention.Heads));

    if Length(Model.Layers[i].FFN_weights1) > 0 then
      colCount := Length(Model.Layers[i].FFN_weights1[0])
    else
      colCount := 0;
    WriteLn('    FFN weights1: ', Length(Model.Layers[i].FFN_weights1), 'x', colCount);

    if Length(Model.Layers[i].FFN_weights2) > 0 then
      colCount := Length(Model.Layers[i].FFN_weights2[0])
    else
      colCount := 0;
    WriteLn('    FFN weights2: ', Length(Model.Layers[i].FFN_weights2), 'x', colCount);
  end;
end;

// Structural sanity check of the model: configuration, embedding matrix, layer count.
// Checks the basic structure of the model and prints what it finds.
//
// Returns True when Config.InputSize and Config.NumLayers are positive, the
// embedding matrix has at least one row and one column, and the number of
// layers equals Config.NumLayers. Otherwise the first failing check is
// printed and False is returned.
//
// The embedding column count is now computed with plain if/else. The
// previous IfThen(...) call evaluated both arguments, so an empty embedding
// matrix raised a range-check error before the explicit empty-matrix check
// could report it.
function ValidateModelStructure(var Model: TTransformer): Boolean;
var
  embeddingCols: Integer;
begin
  Result := False;

  WriteLn('=== ПРОВЕРКА СТРУКТУРЫ МОДЕЛИ ===');
  WriteLn('Config:');
  WriteLn('  InputSize: ', Model.Config.InputSize);
  WriteLn('  NumLayers: ', Model.Config.NumLayers);
  WriteLn('  NumHeads: ', Model.Config.NumHeads);
  WriteLn('  FFNDim: ', Model.Config.FFNDim);
  WriteLn('  MaxSeqLength: ', Model.Config.MaxSeqLength);

  // Basic configuration checks
  if Model.Config.InputSize <= 0 then
  begin
    WriteLn('ОШИБКА: Model.Config.InputSize = ', Model.Config.InputSize);
    Exit;
  end;

  if Model.Config.NumLayers <= 0 then
  begin
    WriteLn('ОШИБКА: Model.Config.NumLayers = ', Model.Config.NumLayers);
    Exit;
  end;

  // Embedding matrix
  if Length(Model.Embedding) > 0 then
    embeddingCols := Length(Model.Embedding[0])
  else
    embeddingCols := 0;
  WriteLn('Embedding: ', Length(Model.Embedding), 'x', embeddingCols);

  if (Length(Model.Embedding) = 0) or (embeddingCols = 0) then
  begin
    WriteLn('ОШИБКА: Пустая embedding матрица');
    Exit;
  end;

  // Layer count
  WriteLn('Layers: ', Length(Model.Layers));

  if Length(Model.Layers) <> Model.Config.NumLayers then
  begin
    WriteLn('ОШИБКА: Несовпадение количества слоев');
    WriteLn('  Ожидалось: ', Model.Config.NumLayers);
    WriteLn('  Фактически: ', Length(Model.Layers));
    Exit;
  end;

  WriteLn('Структура модели валидна');
  Result := True;
end;

end.