unit Transformer;
{$MODE OBJFPC}{$H+}{$RANGECHECKS ON}{$ASMMODE INTEL}

{
    Part of AdvancedChatAI.
    For GNU/Linux 64 bit version.
    Version: 1.
    Written in Free Pascal (https://freepascal.org/).
    Copyright (C) 2025-2026 Artyomov Alexander
    Used https://chat.deepseek.com/
    http://self-made-free.ru/
    aralni@mail.ru

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
}


interface

uses
  SysUtils, DataUtils, MatrixOps, Attention, PositionalEncoding, Optimizers,Math;

type
  { Hyperparameters of a transformer model.
    InitTransformer replaces non-positive InputSize, NumLayers, NumHeads,
    FFNDim and MaxSeqLength with defaults before storing the record;
    the remaining fields are stored exactly as supplied. }
  TTransformerConfig = record
    // Integer dimensions. InitTransformer uses InputSize for both dimensions
    // of the embedding matrix and NumLayers for the number of layers.
    InputSize, NumLayers, NumHeads, FFNDim, MaxSeqLength: Integer;
    // Regularisation settings. In the part of the unit reviewed here only
    // DropoutRate is read (FFN dropout while training); the other values are
    // presumably consumed elsewhere — TODO confirm.
    DropoutRate, AttentionDropout, ResidualDropout, WeightDecay, GradientClipValue: Double;
    // Feature switches; not read in the part of the unit reviewed here.
    UseLayerNorm, UseStochasticDepth: Boolean;
  end;

  { State of a single transformer layer: the self-attention block, the two
    feed-forward weight matrices, LayerNorm parameters, their gradients and
    optimizer states, plus matrices cached by the forward pass. }
  TTransformerLayer = record
    SelfAttention: TMultiHeadAttention;
    DropoutMask,
    cachedInput, // Layer input
    cachedOutput, // Layer output
    ffnOutput, // FFN output
    norm1Output, // LayerNorm1 output
    norm2Output, // LayerNorm2 output
    FFN_weights1, // [input_size x ffn_dim]
    FFN_weights2, // [ffn_dim x input_size]
    // Gradients
    FFN1_Grad, FFN2_Grad: TDoubleMatrix;
    // Adam states
    FFN1_AdamState, FFN2_AdamState: TAdamState;
    // LayerNorm parameters
    Norm1_Gamma, Norm1_Beta, Norm2_Gamma, Norm2_Beta,
    // LayerNorm gradients
    Norm1_Gamma_Grad, Norm1_Beta_Grad, Norm2_Gamma_Grad, Norm2_Beta_Grad: TDoubleArray;
    // Adam states for LayerNorm
    Norm1_Gamma_AdamState, Norm1_Beta_AdamState, Norm2_Gamma_AdamState, Norm2_Beta_AdamState: TAdamVectorState;
    DropoutMasks: TDoubleMatrixArray; // For the different kinds of dropout
    StochasticDepthMask: TDoubleArray; // For stochastic depth
    // Per-layer copy of the model configuration; ForwardFFNOptimized reads
    // Config.DropoutRate from it.
Config:TTransformerConfig;
    // Caches filled by the forward pass (when isTraining) for the backward pass.
    // NOTE(review): norm1Output/norm2Output above are written by
    // ProcessSequenceThroughLayersOptimized while ForwardLayer writes the
    // cached* fields here — confirm which set the backward pass reads.
    cachedAttentionOutput, cachedNorm1Output, cachedFFNOutput: TDoubleMatrix;
    // Gradient fields.
    // NOTE(review): these look like duplicates of FFN1_Grad/FFN2_Grad above;
    // verify which pair the optimizer actually consumes.
    FFN_weights1_Grad, FFN_weights2_Grad: TDoubleMatrix;
  end;

  { Complete transformer model: the layer stack, the embedding matrix with its
    gradient and optimizer state, the positional encoding and the configuration. }
  TTransformer = record
    // One entry per layer; sized to Config.NumLayers by InitTransformer.
    Layers: array of TTransformerLayer;
    // Embedding and Embedding_Grad are allocated by InitTransformer as
    // InputSize x InputSize matrices. PosEncoding is not allocated by
    // InitTransformer; it is only released by FreeTransformer.
    PosEncoding, Embedding, Embedding_Grad: TDoubleMatrix;
    Embedding_AdamState: TAdamState;
    // Configuration after InitTransformer replaced invalid values with defaults.
    Config: TTransformerConfig;
    // Regularisation fields (presumably parameters of the pre-layer
    // normalisation applied by ApplyPreLayerNorm — TODO confirm).
    PreNormGamma, PreNormBeta: TDoubleArray;
  end;

// Public routines of the unit. Only routines whose implementation is part of
// the reviewed section carry a description; the others are implemented
// further down in the unit.

// Stores a sanitised copy of Config in Model, allocates the embedding matrix
// and its gradient, and initialises every layer.
procedure InitTransformer(var Model: TTransformer; const Config: TTransformerConfig);
// Releases the layers, the embedding matrix and the positional encoding.
procedure FreeTransformer(var model: TTransformer);

// Placeholder heuristic: returns 2 when input has at least 10 rows, otherwise 1.
function DetectExampleCount(const input: TDoubleMatrix): Integer;
// Stacks the rows of all examples, in order, into one matrix.
function CombineExampleSequences(const examples: array of TDoubleMatrix): TDoubleMatrix;
procedure ForwardTransformer(var model: TTransformer; const input: TDoubleMatrix; out output: TDoubleMatrix; mask: TDoubleMatrix = nil; isTraining: Boolean = True);
procedure BackwardTransformer(var model: TTransformer; const input, gradOutput: TDoubleMatrix);
procedure UpdateTransformer(var model: TTransformer; learningRate: Double);
procedure EmbeddingBackward(var model: TTransformer; const gradOutput: TDoubleMatrix; const input: TDoubleMatrix);
function FFNBackward(var Layer: TTransformerLayer; const gradOutput: TDoubleMatrix; const ffnInput: TDoubleMatrix): TDoubleMatrix;
function LayerNormBackward(const gradOutput, normOutput, input: TDoubleMatrix; const gamma: TDoubleArray; out gradGamma, gradBeta: TDoubleArray): TDoubleMatrix;
procedure CheckModelDimensions(const Model: TTransformer);
procedure CheckAdamStates(const Model: TTransformer);
procedure ApplyWeightDecay(var Model: TTransformer; LearningRate: Double);
procedure ApplyL2ToMatrix(var Matrix: TDoubleMatrix; WeightDecay, LearningRate: Double);
procedure ApplyPreLayerNorm(var model: TTransformer; const input: TDoubleMatrix; out normalized: TDoubleMatrix);
procedure ApplyGradientClippingToModel(var Model: TTransformer; maxNorm: Double);
procedure ResetGradients(var Model: TTransformer);
procedure ValidateModel(var model: TTransformer);
procedure FastLayerNorm(var X: TDoubleMatrix; const Gamma, Beta: TDoubleArray);
// Runs a copy of sequence through every layer (attention, add, LayerNorm,
// FFN, add, LayerNorm) and returns the result; an exception inside a layer
// is logged and processing continues with the next layer.
function ProcessSequenceThroughLayersOptimized(var model: TTransformer; const sequence: TDoubleMatrix; mask: TDoubleMatrix; isTraining: Boolean): TDoubleMatrix;
function CreatePositionalEncoding(maxSeqLength, embeddingSize: Integer): TDoubleMatrix;
// Multi-head attention forward pass: runs every head, concatenates the head
// outputs and applies the output projection. Halts the program on empty Q, K or V.
procedure ForwardAttentionOptimized(var Attention: TMultiHeadAttention; const Q, K, V: TDoubleMatrix; out Output: TDoubleMatrix; Mask: TDoubleMatrix; isTraining: Boolean);
procedure InitTransformerLayer(var Layer: TTransformerLayer; const Config: TTransformerConfig);
procedure BackwardLayer(var Layer: TTransformerLayer; const gradOutput: TDoubleMatrix; out gradInput: TDoubleMatrix);
procedure InitMultiHeadAttention(var Attention: TMultiHeadAttention; inputSize, headSize, numHeads: Integer);
// Forward pass of one layer: self-attention, residual add, LayerNorm, FFN,
// residual add, LayerNorm. Caches intermediates in Layer when isTraining.
// Halts the program on empty input or on any exception.
procedure ForwardLayer(var Layer: TTransformerLayer; const Input: TDoubleMatrix; out Output: TDoubleMatrix; Mask: TDoubleMatrix; isTraining: Boolean);
function ValidateModelStructure(const Model: TTransformer): Boolean;
procedure InitializeAllGradients(var Model: TTransformer);
function ApplyOutputProjection(const concatOutput: TDoubleMatrix; const Heads: array of TAttentionHead): TDoubleMatrix;

var
  // Model instance exported by this unit.
  TransformerModel: TTransformer;
  // Starts as False; it is not modified in the part of the unit reviewed here
  // (presumably set by the code that initialises TransformerModel — TODO confirm).
  IsTransformerInitialized: Boolean = False;

implementation

uses TrainerUnit;

{$I asmf.inc}

{ Initialises Model from Config.

  Non-positive InputSize, NumLayers, NumHeads, FFNDim and MaxSeqLength are
  replaced by 300, 2, 4, 512 and 1000 respectively, with a warning printed for
  every substitution; the adjusted record is stored in Model.Config.
  The embedding matrix (InputSize x InputSize, created through
  CreateRandomMatrix with the bounds -0.1 and 0.1) and a zero gradient matrix
  of the same size are allocated, then every layer is set up through
  InitTransformerLayer. Progress is reported on standard output. }
procedure InitTransformer(var Model: TTransformer; const Config: TTransformerConfig);
var
  layerIdx, dim: Integer;
  cfg: TTransformerConfig;

  // Swaps a non-positive setting for Fallback and reports the substitution
  // as Prefix + old value + Suffix.
  procedure EnsurePositive(var Setting: Integer; const Prefix, Suffix: string; Fallback: Integer);
  begin
    if Setting > 0 then
      Exit;
    WriteLn(Prefix, Setting, Suffix);
    Setting := Fallback;
  end;

begin
  WriteLn('InitTransformer: начат');

  // Work on a private copy so the caller's record is never modified.
  cfg := Config;
  EnsurePositive(cfg.InputSize, 'ПРЕДУПРЕЖДЕНИЕ: InputSize = ', ', устанавливаем 300', 300);
  EnsurePositive(cfg.NumLayers, 'ПРЕДУПРЕЖДЕНИЕ: NumLayers = ', ', устанавливаем 2', 2);
  EnsurePositive(cfg.NumHeads, 'ПРЕДУПРЕЖДЕНИЕ: NumHeads = ', ', устанавливаем 4', 4);
  EnsurePositive(cfg.FFNDim, 'ПРЕДУПРЕЖДЕНИЕ: FFNDim = ', ', устанавливаем 512', 512);
  EnsurePositive(cfg.MaxSeqLength, 'ПРЕДУПРЕЖДЕНИЕ: MaxSeqLength = ', ', устанавливаем 1000', 1000);
  Model.Config := cfg;

  WriteLn('  Конфигурация модели:');
  WriteLn('    InputSize: ', Model.Config.InputSize);
  WriteLn('    NumLayers: ', Model.Config.NumLayers);
  WriteLn('    NumHeads: ', Model.Config.NumHeads);
  WriteLn('    FFNDim: ', Model.Config.FFNDim);
  WriteLn('    MaxSeqLength: ', Model.Config.MaxSeqLength);

  // Embedding matrix and its gradient are both square, InputSize on each side.
  dim := Model.Config.InputSize;
  WriteLn('  Инициализация embedding матрицы...');
  Model.Embedding := CreateRandomMatrix(dim, dim, -0.1, 0.1);
  WriteLn('    Embedding: ', Length(Model.Embedding), 'x', Length(Model.Embedding[0]));
  Model.Embedding_Grad := CreateZeroMatrix(dim, dim);
  WriteLn('    Embedding_Grad: ', Length(Model.Embedding_Grad), 'x', Length(Model.Embedding_Grad[0]));

  // Layer stack.
  WriteLn('  Инициализация ', Model.Config.NumLayers, ' слоев...');
  SetLength(Model.Layers, Model.Config.NumLayers);
  for layerIdx := 0 to High(Model.Layers) do
  begin
    WriteLn('    Слой ', layerIdx, ':');
    InitTransformerLayer(Model.Layers[layerIdx], Model.Config);
  end;

  WriteLn('InitTransformer: завершен успешно');
end;

{ Releases the memory held by model.

  For every layer the attention block is released through
  FreeMultiHeadAttention and the LayerNorm parameter vectors are cleared.
  Afterwards the layer array, the embedding matrix together with its gradient,
  the positional encoding and the pre-norm parameter vectors are set to
  length zero, so no model-sized array stays allocated.

  NOTE(review): Embedding_AdamState and model.Config are left untouched; if
  TAdamState owns large buffers it may need its own release call — verify
  against the Optimizers unit. }
procedure FreeTransformer(var model: TTransformer);
var
  i: Integer;
begin
  WriteLn('Debug: FreeTransformer called');

  for i := 0 to High(model.Layers) do begin
    FreeMultiHeadAttention(model.Layers[i].SelfAttention);
    SetLength(model.Layers[i].Norm1_Gamma, 0);
    SetLength(model.Layers[i].Norm1_Beta, 0);
    SetLength(model.Layers[i].Norm2_Gamma, 0);
    SetLength(model.Layers[i].Norm2_Beta, 0);
  end;

  // Dropping the layer array releases this reference to each layer record
  // and, with it, the layer's remaining dynamic-array fields.
  SetLength(model.Layers, 0);

  SetLength(model.Embedding, 0);
  // Previously left allocated: the gradient is as large as the embedding itself.
  SetLength(model.Embedding_Grad, 0);
  SetLength(model.PosEncoding, 0);
  SetLength(model.PreNormGamma, 0);
  SetLength(model.PreNormBeta, 0);
end;

{ Multiplies Input by the model's embedding matrix: Result = Input x Model.Embedding.

  Input must be non-empty and its column count must equal the number of rows
  of Model.Embedding; the result has Length(Input) rows and as many columns as
  Model.Embedding. Rows of Input shorter than the embedding are summed only
  over the elements they have.

  Any violated precondition, or any exception during the multiplication, is
  reported on standard output and terminates the program with exit code 1. }
function ApplyEmbedding(const Model: TTransformer; const Input: TDoubleMatrix): TDoubleMatrix;
var
  i, j, k, lastK: Integer;
  acc: Double;

  // Formats matrix dimensions as 'RxC' without indexing M[0] when M is empty.
  // (IfThen is an ordinary function: its arguments are evaluated before the
  // call, so IfThen(Length(M) > 0, ...M[0]..., '0') fails on an empty matrix.)
  function Dims(const M: TDoubleMatrix): string;
  begin
    if Length(M) > 0 then
      Result := IntToStr(Length(M)) + 'x' + IntToStr(Length(M[0]))
    else
      Result := '0x0';
  end;

begin
  WriteLn('  ApplyEmbedding: начат');
  WriteLn('    Input: ', Dims(Input));

  // Guard: input must contain data.
  if (Length(Input) = 0) or (Length(Input[0]) = 0) then begin
    WriteLn('ОШИБКА: Пустой вход в ApplyEmbedding');
    Halt(1);
  end;

  // Guard: embedding matrix must be initialised.
  if (Length(Model.Embedding) = 0) or (Length(Model.Embedding[0]) = 0) then begin
    WriteLn('ОШИБКА: Пустая embedding матрица');
    WriteLn('    Model.Embedding: ', Dims(Model.Embedding));
    Halt(1);
  end;

  // Guard: inner dimensions must agree.
  if Length(Input[0]) <> Length(Model.Embedding) then begin
    WriteLn('ОШИБКА: Несовпадение размеров в ApplyEmbedding');
    WriteLn('    Input cols: ', Length(Input[0]));
    WriteLn('    Embedding rows: ', Length(Model.Embedding));
    Halt(1);
  end;

  SetLength(Result, Length(Input), Length(Model.Embedding[0]));
  WriteLn('    Result размер: ', Length(Result), 'x', Length(Result[0]));

  try
    for i := 0 to High(Input) do begin
      // Bound the inner index by the shorter of the input row and the embedding.
      lastK := Min(High(Input[i]), High(Model.Embedding));
      for j := 0 to High(Result[i]) do begin
        acc := 0.0;
        for k := 0 to lastK do
          // Skip embedding rows that are shorter than the output width.
          if j < Length(Model.Embedding[k]) then
            acc := acc + Input[i][k] * Model.Embedding[k][j];
        Result[i][j] := acc;
      end;
    end;

    WriteLn('    ApplyEmbedding: завершен успешно');
    WriteLn('    Result: ', Length(Result), 'x', Length(Result[0]));
  except
    on E: Exception do
    begin
      WriteLn('ОШИБКА в ApplyEmbedding: ', E.Message);
      Halt(1);
    end;
  end;
end;

{ Forward pass of a single attention head (scaled dot-product attention).

  Steps: project Q, K and V with Head.Wq/Wk/Wv (an empty weight matrix means
  the input is used unprojected), compute Q_proj x K_proj^T, scale by
  1/sqrt(columns of K_proj), overwrite scores with -1e9 wherever a Mask of
  matching size holds 0, apply Softmax and multiply by V_proj.

  Output receives the head output. isTraining is accepted for signature
  compatibility and is not used. Empty Q, K or V, or any exception, is
  reported on standard output and terminates the program with exit code 1. }
procedure ForwardAttentionHeadOptimized(var Head: TAttentionHead; const Q, K, V: TDoubleMatrix; out Output: TDoubleMatrix; Mask: TDoubleMatrix; isTraining: Boolean);
var
  Q_proj, K_proj, V_proj, scores, attentionWeights: TDoubleMatrix;
  scaleFactor: Double;
  i, j: Integer;
begin
  WriteLn('            ForwardAttentionHeadOptimized: начат');

  // Guard: all three inputs must contain rows.
  if (Length(Q) = 0) or (Length(K) = 0) or (Length(V) = 0) then begin
    WriteLn('ОШИБКА: Пустые входы в ForwardAttentionHeadOptimized');
    Halt(1);
  end;

  try
    // Linear projections; Q, K and V are known to be non-empty here.
    WriteLn('            Линейные проекции...');
    if Length(Head.Wq) > 0 then
      Q_proj := MatrixMultiply(Q, Head.Wq)
    else
      Q_proj := CopyMatrix(Q);

    if Length(Head.Wk) > 0 then
      K_proj := MatrixMultiply(K, Head.Wk)
    else
      K_proj := CopyMatrix(K);

    if Length(Head.Wv) > 0 then
      V_proj := MatrixMultiply(V, Head.Wv)
    else
      V_proj := CopyMatrix(V);

    WriteLn('              Q_proj: ', Length(Q_proj), 'x', Length(Q_proj[0]));
    WriteLn('              K_proj: ', Length(K_proj), 'x', Length(K_proj[0]));
    WriteLn('              V_proj: ', Length(V_proj), 'x', Length(V_proj[0]));

    // Scaled dot-product attention scores.
    WriteLn('            Scaled Dot-Product Attention...');
    scores := MatrixMultiply(Q_proj, TransposeMatrix(K_proj));
    WriteLn('              scores: ', Length(scores), 'x', Length(scores[0]));

    // Scaling by 1/sqrt(d_k), where d_k is the projected key width.
    if (Length(scores) > 0) and (Length(scores[0]) > 0) then begin
      scaleFactor := 1.0 / Sqrt(Length(K_proj[0]));
      ScaleMatrix(scores, scaleFactor);
    end;

    // The mask is applied only when its shape matches the score matrix;
    // a mask of any other shape is silently ignored.
    if (Length(Mask) > 0) and (Length(Mask) = Length(scores)) and (Length(Mask[0]) = Length(scores[0])) then begin
      for i := 0 to High(scores) do
        for j := 0 to High(scores[i]) do
          if Mask[i][j] = 0 then
            scores[i][j] := -1e9;
    end;

    // Softmax over the scores.
    WriteLn('            Softmax...');
    attentionWeights := Softmax(scores);
    WriteLn('              attentionWeights: ', Length(attentionWeights), 'x', Length(attentionWeights[0]));

    // Weighted sum of the values.
    WriteLn('            Применение весов...');
    if (Length(attentionWeights) > 0) and (Length(V_proj) > 0) then
      Output := MatrixMultiply(attentionWeights, V_proj)
    else
      Output := CopyMatrix(attentionWeights);

    WriteLn('              Output: ', Length(Output), 'x', Length(Output[0]));
    WriteLn('            ForwardAttentionHeadOptimized: завершен успешно');
  except
    on E: Exception do
    begin
      WriteLn('ОШИБКА в ForwardAttentionHeadOptimized: ', E.Message);
      Halt(1);
    end;
  end;
end;

{ Feed-forward block of a layer: Output = ReLU(Input x FFN_weights1) x FFN_weights2.

  When isTraining is set and Layer.Config.DropoutRate > 0, Dropout is applied
  to the hidden activations. An empty weight matrix makes the corresponding
  multiplication a pass-through copy.

  Empty Input is reported and yields an empty Output. Any exception during
  the computation is reported and terminates the program with exit code 1. }
procedure ForwardFFNOptimized(const Layer: TTransformerLayer; const Input: TDoubleMatrix; out Output: TDoubleMatrix; isTraining: Boolean);
var
  hidden: TDoubleMatrix;

  // Formats matrix dimensions as 'RxC' without indexing M[0] when M is empty.
  function Dims(const M: TDoubleMatrix): string;
  begin
    if Length(M) > 0 then
      Result := IntToStr(Length(M)) + 'x' + IntToStr(Length(M[0]))
    else
      Result := '0x0';
  end;

begin
  WriteLn('        ForwardFFNOptimized: начат');
  // Input[0] must not be touched before the emptiness guard below.
  WriteLn('          Input: ', Dims(Input));

  // Guard: nothing to compute for empty input.
  if (Length(Input) = 0) or (Length(Input[0]) = 0) then begin
    WriteLn('ОШИБКА: Пустой вход в ForwardFFNOptimized');
    SetLength(Output, 0, 0);
    Exit;
  end;

  try
    // First linear layer followed by ReLU.
    WriteLn('          Первый линейный слой...');
    if Length(Layer.FFN_weights1) > 0 then
      hidden := MatrixMultiply(Input, Layer.FFN_weights1)
    else
      hidden := CopyMatrix(Input);

    WriteLn('            hidden до ReLU: ', Length(hidden), 'x', Length(hidden[0]));
    hidden := ReLU(hidden);
    WriteLn('            hidden после ReLU: ', Length(hidden), 'x', Length(hidden[0]));

    // Dropout is active only while training.
    if isTraining and (Layer.Config.DropoutRate > 0) then begin
      WriteLn('          Применение dropout...');
      hidden := Dropout(hidden, Layer.Config.DropoutRate);
    end;

    // Second linear layer.
    WriteLn('          Второй линейный слой...');
    if (Length(hidden) > 0) and (Length(Layer.FFN_weights2) > 0) then
      Output := MatrixMultiply(hidden, Layer.FFN_weights2)
    else
      Output := CopyMatrix(hidden);

    WriteLn('          Output: ', Length(Output), 'x', Length(Output[0]));
    WriteLn('        ForwardFFNOptimized: завершен успешно');
  except
    on E: Exception do begin
      WriteLn('ОШИБКА в ForwardFFNOptimized: ', E.Message);
      Halt(1);
    end;
  end;
end;

{ Trivial stand-in for a layer forward pass: Output becomes a copy of Input.
  Layer is not used. Dimensions of both matrices are logged; an empty Input
  is logged as 0x0 and produces whatever CopyMatrix returns for it. }
procedure ForwardLayerOptimizedSimple(var Layer: TTransformerLayer; const Input: TDoubleMatrix; out Output: TDoubleMatrix);

  // Formats matrix dimensions as 'RxC' without indexing M[0] when M is empty.
  // (IfThen evaluates both branches before the call, so using it to protect
  // M[0] raises a range error for an empty matrix.)
  function Dims(const M: TDoubleMatrix): string;
  begin
    if Length(M) > 0 then
      Result := IntToStr(Length(M)) + 'x' + IntToStr(Length(M[0]))
    else
      Result := '0x0';
  end;

begin
  WriteLn('    ForwardLayerOptimizedSimple: начат');
  WriteLn('      Input: ', Dims(Input));
  // Simplest possible implementation: copy the input.
  Output := CopyMatrix(Input);
  WriteLn('    ForwardLayerOptimizedSimple: завершен');
  WriteLn('      Output: ', Dims(Output));
end;

{ Forward pass of one transformer layer without per-step logging.

  Sequence: self-attention on Input, residual add, LayerNorm 1, feed-forward
  block, residual add, LayerNorm 2. A residual add is skipped (the sub-block
  output is used alone) when the shapes of its two operands differ.
  When isTraining is set, Input, the attention output, the LayerNorm 1 output,
  the FFN output and the final output are cached in Layer for the backward pass.

  Any exception is reported on standard output and terminates the program
  with exit code 1. }
procedure ForwardLayerOptimized(var Layer: TTransformerLayer; const Input: TDoubleMatrix; out Output: TDoubleMatrix; Mask: TDoubleMatrix; isTraining: Boolean);
var
  attentionOutput, ffnOutput, norm1Output, residual: TDoubleMatrix;
begin
  WriteLn('    ForwardLayerOptimized: начат');

  try
    // 1. Self-attention.
    WriteLn('      Self-Attention...');
    ForwardAttentionOptimized(Layer.SelfAttention, Input, Input, Input, attentionOutput, Mask, isTraining);

    // Cache for the backward pass.
    if isTraining then begin
      Layer.cachedInput := CopyMatrix(Input);
      Layer.cachedAttentionOutput := CopyMatrix(attentionOutput);
    end;

    // Residual connection 1 (only when shapes agree).
    if (Length(attentionOutput) = Length(Input)) and (Length(attentionOutput[0]) = Length(Input[0])) then
      residual := AddMatrices(Input, attentionOutput)
    else
      residual := CopyMatrix(attentionOutput);

    // LayerNorm 1.
    norm1Output := LayerNorm(residual, Layer.Norm1_Gamma, Layer.Norm1_Beta);

    // 2. Feed-forward network.
    WriteLn('      FFN...');
    ForwardFFNOptimized(Layer, norm1Output, ffnOutput, isTraining);

    // Cache for the backward pass.
    if isTraining then begin
      Layer.cachedNorm1Output := CopyMatrix(norm1Output);
      Layer.cachedFFNOutput := CopyMatrix(ffnOutput);
    end;

    // Residual connection 2 (only when shapes agree).
    if (Length(ffnOutput) = Length(norm1Output)) and (Length(ffnOutput[0]) = Length(norm1Output[0])) then
      residual := AddMatrices(norm1Output, ffnOutput)
    else
      residual := CopyMatrix(ffnOutput);

    // LayerNorm 2 produces the layer output.
    Output := LayerNorm(residual, Layer.Norm2_Gamma, Layer.Norm2_Beta);

    // Cache the final output.
    if isTraining then
      Layer.cachedOutput := CopyMatrix(Output);

    WriteLn('    ForwardLayerOptimized: завершен успешно');
  except
    on E: Exception do
    begin
      WriteLn('ОШИБКА в ForwardLayerOptimized: ', E.Message);
      Halt(1);
    end;
  end;
end;

{ Multi-head attention forward pass.

  Every head in Attention.Heads is evaluated with ForwardAttentionHeadOptimized
  on the same Q, K, V and Mask; a head that returns an empty matrix is replaced
  by a copy of Q. The head outputs are joined with ConcatMatrices and passed
  through ApplyOutputProjection; the result is stored in Output.

  Empty Q, K or V, an empty concatenation result, or any exception is reported
  on standard output and terminates the program with exit code 1. }
procedure ForwardAttentionOptimized(var Attention: TMultiHeadAttention; const Q, K, V: TDoubleMatrix; out Output: TDoubleMatrix; Mask: TDoubleMatrix; isTraining: Boolean);
var
  i: Integer;
  headOutputs: array of TDoubleMatrix;
  headOutput, concatOutput: TDoubleMatrix;

  // Formats matrix dimensions as 'RxC' without indexing M[0] when M is empty.
  // (IfThen evaluates both branches before the call, so it cannot be used to
  // protect M[0] on an empty matrix.)
  function Dims(const M: TDoubleMatrix): string;
  begin
    if Length(M) > 0 then
      Result := IntToStr(Length(M)) + 'x' + IntToStr(Length(M[0]))
    else
      Result := '0x0';
  end;

begin
  WriteLn('        ForwardAttentionOptimized: начат');
  WriteLn('          Q: ', Dims(Q));
  WriteLn('          K: ', Dims(K));
  WriteLn('          V: ', Dims(V));

  // Guard: all three inputs must contain rows.
  if (Length(Q) = 0) or (Length(K) = 0) or (Length(V) = 0) then begin
    WriteLn('ОШИБКА: Пустые входы в ForwardAttentionOptimized');
    Halt(1);
  end;

  try
    SetLength(headOutputs, Length(Attention.Heads));

    // Evaluate every attention head.
    for i := 0 to High(Attention.Heads) do begin
      WriteLn('          Голова ', i, ':');
      ForwardAttentionHeadOptimized(Attention.Heads[i], Q, K, V, headOutput, Mask, isTraining);

      if (Length(headOutput) = 0) or (Length(headOutput[0]) = 0) then begin
        WriteLn('          ОШИБКА: Пустой вывод головы ', i);
        // Fall back to Q so that concatenation still has data for this head.
        headOutputs[i] := CopyMatrix(Q);
      end else
        headOutputs[i] := headOutput;

      WriteLn('            headOutput: ', Dims(headOutputs[i]));
    end;

    // Concatenate the outputs of all heads.
    WriteLn('          Конкатенация голов...');
    concatOutput := ConcatMatrices(headOutputs);
    WriteLn('            concatOutput: ', Dims(concatOutput));

    if (Length(concatOutput) = 0) or (Length(concatOutput[0]) = 0) then begin
      WriteLn('          ОШИБКА: Пустой concatOutput');
      Halt(1);
    end;

    // Output projection.
    WriteLn('        Выходная проекция...');
    WriteLn('          concatOutput: ', Length(concatOutput), 'x', Length(concatOutput[0]));
    WriteLn('          Wo: ', Length(Attention.Heads[0].Wo), 'x', Length(Attention.Heads[0].Wo[0]));

    Output := ApplyOutputProjection(concatOutput, Attention.Heads);

    WriteLn('          Output: ', Dims(Output));
    WriteLn('        ForwardAttentionOptimized: завершен успешно');
  except
    on E: Exception do begin
      WriteLn('ОШИБКА в ForwardAttentionOptimized: ', E.Message);
      WriteLn('  Тип ошибки: ', E.ClassName);
      Halt(1);
    end;
  end;
end;

{ Alternative version that does not take a var parameter.

  Returns a copy of sequence with a positional encoding added to its first
  rows. The number of encoded rows is min(maxSeqLength, rows of sequence), or
  all rows when maxSeqLength <= 0; rows beyond that limit are returned
  unchanged. The encoding is obtained from CreatePositionalEncoding.

  An empty sequence or an empty positional encoding is reported and terminates
  the program with exit code 1. If the encoding does not fit the sequence
  shape, the mismatch is reported and the unmodified copy is returned. }
function AddPositionalEncodingToSequenceSafe(const sequence: TDoubleMatrix; maxSeqLength: Integer): TDoubleMatrix;
var
  i, j, actualMaxLength: Integer;
  posEnc: TDoubleMatrix;

  // Formats matrix dimensions as 'RxC' without indexing M[0] when M is empty.
  // (IfThen evaluates both branches before the call, so it cannot be used to
  // protect M[0] on an empty matrix.)
  function Dims(const M: TDoubleMatrix): string;
  begin
    if Length(M) > 0 then
      Result := IntToStr(Length(M)) + 'x' + IntToStr(Length(M[0]))
    else
      Result := '0x0';
  end;

begin
  WriteLn('      AddPositionalEncodingToSequenceSafe: начат');
  WriteLn('        sequence: ', Dims(sequence));

  // Work on a copy of the input sequence.
  Result := CopyMatrix(sequence);

  // Guard: the sequence must contain data.
  if (Length(Result) = 0) or (Length(Result[0]) = 0) then begin
    WriteLn('ОШИБКА: Пустая sequence в AddPositionalEncodingToSequenceSafe');
    Halt(1);
  end;

  // A non-positive limit means "encode every row".
  if maxSeqLength <= 0 then begin
    actualMaxLength := Length(Result);
    WriteLn('        Исправляем maxSeqLength с ', maxSeqLength, ' на ', actualMaxLength);
  end else
    actualMaxLength := Min(maxSeqLength, Length(Result));

  WriteLn('        actualMaxLength: ', actualMaxLength);

  // Build the positional encoding.
  posEnc := CreatePositionalEncoding(actualMaxLength, Length(Result[0]));
  WriteLn('        posEnc создан: ', Dims(posEnc));

  // Guard: the encoding must contain data.
  if (Length(posEnc) = 0) or (Length(posEnc[0]) = 0) then begin
    WriteLn('ОШИБКА: Пустое позиционное кодирование');
    Halt(1);
  end;

  // Shape mismatch is not fatal: the caller gets the sequence without encoding.
  if (Length(Result) < Length(posEnc)) or (Length(Result[0]) <> Length(posEnc[0])) then begin
    WriteLn('ОШИБКА: Несовпадение размеров sequence и posEnc');
    WriteLn('        sequence: ', Length(Result), 'x', Length(Result[0]));
    WriteLn('        posEnc: ', Length(posEnc), 'x', Length(posEnc[0]));
    Exit;
  end;

  // Add the encoding element-wise; both loops are bounded by the shorter
  // operand so that irregular row lengths cannot cause an out-of-range access.
  for i := 0 to Min(actualMaxLength, Length(posEnc)) - 1 do
    for j := 0 to Min(High(Result[i]), High(posEnc[i])) do
      Result[i][j] := Result[i][j] + posEnc[i][j];

  WriteLn('        Result: ', Length(Result), 'x', Length(Result[0]));
  WriteLn('      AddPositionalEncodingToSequenceSafe: завершен');
end;

{ Improved debugging version of the pass-through layer: validates Input, logs
  its address and dimensions, and copies it element by element into Output.
  Layer is not used.

  An empty Input (no rows, or an empty first row) is reported and terminates
  the program with exit code 1. An exception during the copy is reported and
  leaves Output empty.

  NOTE(review): Output is an `out` parameter of a managed type, so it is
  cleared on entry. If a caller passes the same variable for Input and Output,
  Input would arrive empty — possibly the "array lost when passed" symptom the
  log message below describes; verify at the call sites. }
procedure DebugForwardLayerOptimizedSimple(var Layer: TTransformerLayer; const Input: TDoubleMatrix; out Output: TDoubleMatrix);
var
  i, j: Integer;
begin
  WriteLn('    DebugForwardLayerOptimizedSimple: начат');
  WriteLn('      Input указатель в функции: ', PtrUInt(@Input));

  // In-depth validation of the input.
  WriteLn('      Проверка Input...');
  WriteLn('        Input длина: ', Length(Input));

  if Length(Input) = 0 then begin
    WriteLn('      КРИТИЧЕСКАЯ ОШИБКА: Input имеет длину 0 внутри функции');
    WriteLn('      Это означает, что массив теряется при передаче в функцию!');
    Halt(1);
  end;

  WriteLn('        Input[0] длина: ', Length(Input[0]));

  if Length(Input[0]) = 0 then begin
    WriteLn('      ОШИБКА: Input[0] имеет длину 0');
    Halt(1);
  end;

  WriteLn('      Input: ', Length(Input), 'x', Length(Input[0]));

  try
    WriteLn('      Копирование матрицы...');

    // Allocate an output matrix shaped like the input (width of the first row).
    SetLength(Output, Length(Input), Length(Input[0]));
    WriteLn('      Output выделен: ', Length(Output), 'x', Length(Output[0]));

    // Copy the data. A row longer than the first one triggers a range error,
    // which is handled below.
    for i := 0 to High(Input) do
      for j := 0 to High(Input[i]) do
        Output[i][j] := Input[i][j];

    WriteLn('      Копирование завершено успешно');
    WriteLn('      Output: ', Length(Output), 'x', Length(Output[0]));

    WriteLn('    DebugForwardLayerOptimizedSimple: завершен успешно');
  except
    on E: Exception do begin
      WriteLn('      ОШИБКА в DebugForwardLayerOptimizedSimple: ', E.Message);
      WriteLn('        Тип ошибки: ', E.ClassName);
      SetLength(Output, 0, 0);
    end;
  end;
end;

{ Forward pass of one transformer layer with detailed logging.

  Sequence: self-attention on Input, residual add, LayerNorm 1, feed-forward
  block, residual add, LayerNorm 2. A residual add is skipped with a warning
  (the sub-block output is used alone) when the shapes of its operands differ.
  When isTraining is set, Input, the attention output, the LayerNorm 1 output,
  the FFN output and the final output are cached in Layer for the backward pass.

  Empty Input, an empty attention or FFN output, or any exception is reported
  on standard output and terminates the program with exit code 1.

  NOTE(review): the log messages name this routine 'ForwardLayerOptimized',
  which is also the name of a separate routine in this unit; the texts are
  kept unchanged here so existing log output stays the same. }
procedure ForwardLayer(var Layer: TTransformerLayer; const Input: TDoubleMatrix; out Output: TDoubleMatrix; Mask: TDoubleMatrix; isTraining: Boolean);
var
  attentionOutput, ffnOutput, norm1Output, residual: TDoubleMatrix;

  // Formats matrix dimensions as 'RxC' without indexing M[0] when M is empty.
  // (IfThen evaluates both branches before the call, so it cannot be used to
  // protect M[0] on an empty matrix.)
  function Dims(const M: TDoubleMatrix): string;
  begin
    if Length(M) > 0 then
      Result := IntToStr(Length(M)) + 'x' + IntToStr(Length(M[0]))
    else
      Result := '0x0';
  end;

begin
  WriteLn('    ForwardLayerOptimized: начат');
  WriteLn('      Input: ', Dims(Input));

  // Guard: the layer needs a non-empty input.
  if (Length(Input) = 0) or (Length(Input[0]) = 0) then begin
    WriteLn('ОШИБКА: Пустой вход в ForwardLayerOptimized');
    Halt(1);
  end;

  try
    // 1. Self-attention.
    WriteLn('      Self-Attention...');
    ForwardAttentionOptimized(Layer.SelfAttention, Input, Input, Input, attentionOutput, Mask, isTraining);
    WriteLn('        attentionOutput: ', Dims(attentionOutput));

    if (Length(attentionOutput) = 0) or (Length(attentionOutput[0]) = 0) then begin
      WriteLn('ОШИБКА: Пустой attentionOutput');
      Halt(1);
    end;

    // Cache for the backward pass.
    if isTraining then begin
      Layer.cachedInput := CopyMatrix(Input);
      Layer.cachedAttentionOutput := CopyMatrix(attentionOutput);
    end;

    // Residual connection 1.
    WriteLn('      Residual connection 1...');
    if (Length(attentionOutput) = Length(Input)) and (Length(attentionOutput[0]) = Length(Input[0])) then
      residual := AddMatrices(Input, attentionOutput)
    else begin
      WriteLn('      Предупреждение: размеры не совпадают для residual connection 1');
      WriteLn('        Input: ', Dims(Input));
      WriteLn('        attentionOutput: ', Dims(attentionOutput));
      residual := CopyMatrix(attentionOutput);
    end;

    WriteLn('        residual: ', Dims(residual));

    // LayerNorm 1.
    WriteLn('      LayerNorm 1...');
    norm1Output := LayerNorm(residual, Layer.Norm1_Gamma, Layer.Norm1_Beta);
    WriteLn('        norm1Output: ', Dims(norm1Output));

    // Cache for the backward pass.
    if isTraining then
      Layer.cachedNorm1Output := CopyMatrix(norm1Output);

    // 2. Feed-forward network.
    WriteLn('      FFN...');
    ForwardFFNOptimized(Layer, norm1Output, ffnOutput, isTraining);
    WriteLn('        ffnOutput: ', Dims(ffnOutput));

    if (Length(ffnOutput) = 0) or (Length(ffnOutput[0]) = 0) then begin
      WriteLn('ОШИБКА: Пустой ffnOutput');
      Halt(1);
    end;

    // Cache for the backward pass.
    if isTraining then
      Layer.cachedFFNOutput := CopyMatrix(ffnOutput);

    // Residual connection 2.
    WriteLn('      Residual connection 2...');
    if (Length(ffnOutput) = Length(norm1Output)) and (Length(ffnOutput[0]) = Length(norm1Output[0])) then
      residual := AddMatrices(norm1Output, ffnOutput)
    else begin
      WriteLn('      Предупреждение: размеры не совпадают для residual connection 2');
      WriteLn('        norm1Output: ', Dims(norm1Output));
      WriteLn('        ffnOutput: ', Dims(ffnOutput));
      residual := CopyMatrix(ffnOutput);
    end;

    WriteLn('        residual: ', Dims(residual));

    // LayerNorm 2 produces the layer output.
    WriteLn('      LayerNorm 2...');
    Output := LayerNorm(residual, Layer.Norm2_Gamma, Layer.Norm2_Beta);
    WriteLn('        Output: ', Dims(Output));

    // Cache the final output.
    if isTraining then
      Layer.cachedOutput := CopyMatrix(Output);

    WriteLn('    ForwardLayerOptimized: завершен успешно');
  except
    on E: Exception do begin
      WriteLn('ОШИБКА в ForwardLayerOptimized: ', E.Message);
      WriteLn('  Тип ошибки: ', E.ClassName);
      Halt(1);
    end;
  end;
end;

{ Guesses how many examples are packed into input.
  Placeholder heuristic that looks only at the row count: a matrix with ten or
  more rows is treated as two examples, anything shorter as one. The logic is
  expected to be replaced by something more precise. }
function DetectExampleCount(const input: TDoubleMatrix): Integer;
const
  MIN_ROWS_FOR_TWO_EXAMPLES = 10;
begin
  Result := 1;
  if Length(input) >= MIN_ROWS_FOR_TWO_EXAMPLES then
    Result := 2;
end;

{ Runs a copy of sequence through every layer of model and returns the result.

  Per layer: multi-head attention, in-place residual add, in-place LayerNorm 1,
  feed-forward block (MatrixMultiplyFast, ReLU, MatrixMultiplyFast), in-place
  residual add, in-place LayerNorm 2. The working matrix itself serves as the
  residual operand, so no separate residual copy is kept.
  When isTraining is set, the matrix after each LayerNorm is stored in the
  layer's norm1Output / norm2Output fields.

  An exception inside a layer is logged and processing continues with the next
  layer, using the working matrix in whatever state the failed layer left it.

  NOTE(review): ForwardLayer caches into cachedNorm1Output / cachedOutput,
  whereas this routine fills norm1Output / norm2Output — confirm which fields
  the backward pass expects. }
function ProcessSequenceThroughLayersOptimized(var model: TTransformer; const sequence: TDoubleMatrix; mask: TDoubleMatrix; isTraining: Boolean): TDoubleMatrix;
var
  layer: Integer;
  x, attnOutput, ffnOutput: TDoubleMatrix;
begin
  // Single working matrix that is updated in place by every layer.
  x := CopyMatrix(sequence);

  for layer := 0 to High(model.Layers) do begin
    try
      // 1. Self-attention; x still holds the layer input afterwards, so the
      //    in-place add below is the residual connection.
      MultiHeadAttentionForward(model.Layers[layer].SelfAttention, x, attnOutput, mask);
      MatrixAddInPlace(x, attnOutput);

      // 2. LayerNorm 1, in place.
      FastLayerNorm(x, model.Layers[layer].Norm1_Gamma, model.Layers[layer].Norm1_Beta);

      // Kept for the backward pass only while training.
      if isTraining then
        model.Layers[layer].norm1Output := CopyMatrix(x);

      // 3. Feed-forward block followed by the second residual add.
      ffnOutput := MatrixMultiplyFast(x, model.Layers[layer].FFN_weights1);
      ffnOutput := ReLU(ffnOutput);
      ffnOutput := MatrixMultiplyFast(ffnOutput, model.Layers[layer].FFN_weights2);
      MatrixAddInPlace(x, ffnOutput);

      // 4. LayerNorm 2, in place.
      FastLayerNorm(x, model.Layers[layer].Norm2_Gamma, model.Layers[layer].Norm2_Beta);

      if isTraining then
        model.Layers[layer].norm2Output := CopyMatrix(x);
    except
      on E: Exception do begin
        WriteLn('Ошибка в слое ', layer, ': ', E.Message);
        // Deliberately continue with the remaining layers.
      end;
    end;
  end;

  Result := x;
end;

{ Stacks the rows of all matrices in examples, in order, into one matrix.

  The column count is taken from the first row of the first non-empty example.
  The result is always rectangular: rows longer than that are truncated, and
  shorter rows are padded with zeros. Empty examples contribute no rows.
  Returns an empty matrix when examples is empty or contains no rows. }
function CombineExampleSequences(const examples: array of TDoubleMatrix): TDoubleMatrix;
var
  exIdx, rowIdx, colIdx, outRow, totalRows, cols: Integer;
begin
  // Count rows and find the width; an empty leading example must not be indexed.
  totalRows := 0;
  cols := -1;
  for exIdx := 0 to High(examples) do begin
    Inc(totalRows, Length(examples[exIdx]));
    if (cols < 0) and (Length(examples[exIdx]) > 0) then
      cols := Length(examples[exIdx][0]);
  end;
  if cols < 0 then
    cols := 0;

  // Start from an empty array so that SetLength yields zero-filled rows
  // regardless of what the result variable held before the call.
  Result := nil;
  SetLength(Result, totalRows, cols);

  outRow := 0;
  for exIdx := 0 to High(examples) do
    for rowIdx := 0 to High(examples[exIdx]) do begin
      // Copy into the preallocated row so the row keeps its full width.
      for colIdx := 0 to Min(Length(examples[exIdx][rowIdx]), cols) - 1 do
        Result[outRow][colIdx] := examples[exIdx][rowIdx][colIdx];
      Inc(outRow);
    end;
end;

// Allocates and zeroes every gradient buffer of `model` that is still empty.
//
// Covers the embedding gradient and, per layer, the FFN gradients, the four
// LayerNorm gradient vectors and the dWq/dWk/dWv/dWo matrices of every
// attention head. Buffers that already have a non-zero length are left alone.
// A matrix gradient whose weight matrix is empty is skipped, because its
// column count cannot be read from a first row that does not exist.
procedure InitializeGradients(var model: TTransformer);
var
  i, j: Integer;

  // Shapes `grad` like `weights` and zero-fills it, only if `grad` is empty.
  procedure EnsureMatrixGrad(var grad: TDoubleMatrix; const weights: TDoubleMatrix);
  begin
    if Length(grad) <> 0 then Exit;
    if Length(weights) = 0 then Exit; // nothing to size the gradient from
    SetLength(grad, Length(weights), Length(weights[0]));
    FillMatrix(grad, 0.0);
  end;

  // Sizes `grad` like `params` and zero-fills it, only if `grad` is empty.
  procedure EnsureVectorGrad(var grad: TDoubleArray; const params: TDoubleArray);
  begin
    if Length(grad) <> 0 then Exit;
    SetLength(grad, Length(params));
    FillArray(grad, 0.0);
  end;

begin
  WriteLn('InitializeGradients: инициализация всех градиентов');

  // Embedding gradient
  EnsureMatrixGrad(model.Embedding_Grad, model.Embedding);

  for i := 0 to High(model.Layers) do begin
    // Feed-forward gradients
    EnsureMatrixGrad(model.Layers[i].FFN1_Grad, model.Layers[i].FFN_weights1);
    EnsureMatrixGrad(model.Layers[i].FFN2_Grad, model.Layers[i].FFN_weights2);

    // LayerNorm gradients
    EnsureVectorGrad(model.Layers[i].Norm1_Gamma_Grad, model.Layers[i].Norm1_Gamma);
    EnsureVectorGrad(model.Layers[i].Norm1_Beta_Grad, model.Layers[i].Norm1_Beta);
    EnsureVectorGrad(model.Layers[i].Norm2_Gamma_Grad, model.Layers[i].Norm2_Gamma);
    EnsureVectorGrad(model.Layers[i].Norm2_Beta_Grad, model.Layers[i].Norm2_Beta);

    // Attention gradients, one set per head
    for j := 0 to High(model.Layers[i].SelfAttention.Heads) do begin
      EnsureMatrixGrad(model.Layers[i].SelfAttention.Heads[j].dWq,
                       model.Layers[i].SelfAttention.Heads[j].Wq);
      EnsureMatrixGrad(model.Layers[i].SelfAttention.Heads[j].dWk,
                       model.Layers[i].SelfAttention.Heads[j].Wk);
      EnsureMatrixGrad(model.Layers[i].SelfAttention.Heads[j].dWv,
                       model.Layers[i].SelfAttention.Heads[j].Wv);
      EnsureMatrixGrad(model.Layers[i].SelfAttention.Heads[j].dWo,
                       model.Layers[i].SelfAttention.Heads[j].Wo);
    end;
  end;
end;

// Debug-only backward pass: nudges every row of model.Embedding_Grad by
// 0.01 times the first row of gradOutput. `input` is not used.
//
// Any exception inside the update is logged and the program is halted.
procedure SimpleBackward(var model: TTransformer; const input, gradOutput: TDoubleMatrix);
var
  i, j, gradCols: Integer;
begin
  WriteLn('BackwardTransformer: начат (упрощенная версия)');

  // Read the column count only when a first row exists; indexing gradOutput[0]
  // on an empty matrix would raise here, outside the try block below.
  if Length(gradOutput) > 0 then
    gradCols := Length(gradOutput[0])
  else
    gradCols := 0;
  WriteLn('  gradOutput: ', Length(gradOutput), 'x', gradCols);

  // Simplified version, for debugging only
  try
    // gradOutput is only read, so it is used directly instead of through a copy.
    // The inner bound is the width of the first embedding-gradient row.
    for i := 0 to High(model.Embedding_Grad) do
      for j := 0 to High(model.Embedding_Grad[0]) do
        model.Embedding_Grad[i][j] := model.Embedding_Grad[i][j] + gradOutput[0][j] * 0.01;

    WriteLn('BackwardTransformer: завершен (упрощенная версия)');

  except
    on E: Exception do
    begin
      WriteLn('ОШИБКА в BackwardTransformer: ', E.Message);
      // Fail fast: terminate instead of continuing with a half-applied update
      Halt;
    end;
  end;
end;

// Simplified stand-in for BackwardLayer, used for debugging.
//
// gradInput is set to a copy of gradOutput (the gradient is passed through
// unchanged). If Layer.cachedInput and gradOutput are both non-empty, rough
// placeholder gradients are accumulated:
//   * cachedInput^T * gradOutput       -> FFN_weights1_Grad, and (scaled by 0.1)
//                                         dWq/dWk/dWv/dWo of every head;
//   * gradOutput^T * cachedInput       -> FFN_weights2_Grad;
//   * per-column sums of gradOutput    -> Norm1/Norm2 gamma and beta gradients.
// Accumulation is clipped to the overlapping part of the two matrices.
// Empty gradient buffers are allocated (and the fact logged) first.
//
// Any exception is logged and the program is halted.
//
// NOTE(review): this routine accumulates into Layer.FFN_weights1_Grad /
// FFN_weights2_Grad, while InitializeGradients and UpdateTransformer work with
// Layer.FFN1_Grad / FFN2_Grad. Confirm which pair the optimizer is meant to consume.
procedure SimpleBackwardLayer(var Layer: TTransformerLayer; const gradOutput: TDoubleMatrix; out gradInput: TDoubleMatrix);
var
  i: Integer;
  inputGrad, simpleGradW2: TDoubleMatrix;
  inputGradReady: Boolean;

  // "RxC" for a matrix; "0x0" when it has no rows. A plain function is used
  // because IfThen evaluates both arguments, including m[0] on an empty matrix.
  function ShapeStr(const m: TDoubleMatrix): string;
  begin
    if Length(m) > 0 then
      Result := IntToStr(Length(m)) + 'x' + IntToStr(Length(m[0]))
    else
      Result := '0x0';
  end;

  // Allocates and zeroes an empty per-head gradient, logging which one.
  procedure EnsureHeadGrad(var grad: TDoubleMatrix; const weights: TDoubleMatrix;
                           const gradName: string; headIdx: Integer);
  begin
    if Length(grad) <> 0 then Exit;
    WriteLn('      Инициализируем ', gradName, ' для головы ', headIdx, '...');
    SetLength(grad, Length(weights), Length(weights[0]));
    FillMatrix(grad, 0.0);
  end;

  // Computes cachedInput^T * gradOutput on first use and keeps it; every
  // consumer needs the same product, so it is not recomputed.
  procedure EnsureInputGrad;
  begin
    if inputGradReady then Exit;
    inputGrad := MatrixMultiply(TransposeMatrix(Layer.cachedInput), gradOutput);
    inputGradReady := True;
  end;

  // target += delta over the overlapping region.
  procedure AccumulatePlain(var target: TDoubleMatrix; const delta: TDoubleMatrix);
  var
    r, c: Integer;
  begin
    for r := 0 to Min(High(target), High(delta)) do
      for c := 0 to Min(High(target[r]), High(delta[r])) do
        target[r][c] := target[r][c] + delta[r][c];
  end;

  // grad += 0.1 * (cachedInput^T * gradOutput) for one attention matrix,
  // skipped when the weights or the gradient buffer are empty.
  procedure AccumulateHeadGrad(var grad: TDoubleMatrix; const weights: TDoubleMatrix);
  var
    r, c: Integer;
  begin
    if (Length(weights) = 0) or (Length(grad) = 0) then Exit;
    EnsureInputGrad;
    for r := 0 to Min(High(grad), High(inputGrad)) do
      for c := 0 to Min(High(grad[r]), High(inputGrad[r])) do
        grad[r][c] := grad[r][c] + inputGrad[r][c] * 0.1; // smaller factor for attention
  end;

  // Adds the column sums of gradOutput to both LayerNorm gradient vectors.
  // Only called when gradOutput has at least one row.
  procedure AccumulateColumnSums(var gammaGrad, betaGrad: TDoubleArray);
  var
    r, c: Integer;
    sum: Double;
  begin
    if Length(gammaGrad) = 0 then Exit;
    for c := 0 to Min(High(gammaGrad), High(gradOutput[0])) do
    begin
      sum := 0.0;
      for r := 0 to High(gradOutput) do
        if c < Length(gradOutput[r]) then
          sum := sum + gradOutput[r][c];
      gammaGrad[c] := gammaGrad[c] + sum;
      // beta may be shorter than gamma; never index past its end
      if c < Length(betaGrad) then
        betaGrad[c] := betaGrad[c] + sum;
    end;
  end;

begin
  WriteLn('    SimpleBackwardLayer: начат');
  WriteLn('      gradOutput: ', ShapeStr(gradOutput));
  inputGradReady := False;

  try
    // Allocate FFN gradient buffers that are still empty
    if Length(Layer.FFN_weights1_Grad) = 0 then
    begin
      WriteLn('      Инициализируем FFN_weights1_Grad...');
      SetLength(Layer.FFN_weights1_Grad, Length(Layer.FFN_weights1), Length(Layer.FFN_weights1[0]));
      FillMatrix(Layer.FFN_weights1_Grad, 0.0);
      WriteLn('        FFN_weights1_Grad: ', ShapeStr(Layer.FFN_weights1_Grad));
    end;

    if Length(Layer.FFN_weights2_Grad) = 0 then
    begin
      WriteLn('      Инициализируем FFN_weights2_Grad...');
      SetLength(Layer.FFN_weights2_Grad, Length(Layer.FFN_weights2), Length(Layer.FFN_weights2[0]));
      FillMatrix(Layer.FFN_weights2_Grad, 0.0);
      WriteLn('        FFN_weights2_Grad: ', ShapeStr(Layer.FFN_weights2_Grad));
    end;

    // Allocate attention gradient buffers that are still empty
    for i := 0 to High(Layer.SelfAttention.Heads) do
    begin
      EnsureHeadGrad(Layer.SelfAttention.Heads[i].dWq, Layer.SelfAttention.Heads[i].Wq, 'dWq', i);
      EnsureHeadGrad(Layer.SelfAttention.Heads[i].dWk, Layer.SelfAttention.Heads[i].Wk, 'dWk', i);
      EnsureHeadGrad(Layer.SelfAttention.Heads[i].dWv, Layer.SelfAttention.Heads[i].Wv, 'dWv', i);
      EnsureHeadGrad(Layer.SelfAttention.Heads[i].dWo, Layer.SelfAttention.Heads[i].Wo, 'dWo', i);
    end;

    // The gradient is simply passed on to the previous layer
    gradInput := CopyMatrix(gradOutput);

    // Placeholder weight gradients, only when a cached input is available
    if (Length(Layer.cachedInput) > 0) and (Length(gradOutput) > 0) then
    begin
      // FFN weights 1
      if (Length(Layer.FFN_weights1) > 0) and (Length(Layer.FFN_weights1_Grad) > 0) then
      begin
        EnsureInputGrad;
        WriteLn('      simpleGradW1: ', ShapeStr(inputGrad));
        AccumulatePlain(Layer.FFN_weights1_Grad, inputGrad);
      end;

      // FFN weights 2 (operands in the opposite order)
      if (Length(Layer.FFN_weights2) > 0) and (Length(Layer.FFN_weights2_Grad) > 0) then
      begin
        simpleGradW2 := MatrixMultiply(TransposeMatrix(gradOutput), Layer.cachedInput);
        WriteLn('      simpleGradW2: ', ShapeStr(simpleGradW2));
        AccumulatePlain(Layer.FFN_weights2_Grad, simpleGradW2);
      end;

      // LayerNorm: column sums of gradOutput go to gamma and beta alike
      AccumulateColumnSums(Layer.Norm1_Gamma_Grad, Layer.Norm1_Beta_Grad);
      AccumulateColumnSums(Layer.Norm2_Gamma_Grad, Layer.Norm2_Beta_Grad);

      // Attention weights: the same product, scaled by 0.1, for all four matrices
      for i := 0 to High(Layer.SelfAttention.Heads) do
      begin
        AccumulateHeadGrad(Layer.SelfAttention.Heads[i].dWq, Layer.SelfAttention.Heads[i].Wq);
        AccumulateHeadGrad(Layer.SelfAttention.Heads[i].dWk, Layer.SelfAttention.Heads[i].Wk);
        AccumulateHeadGrad(Layer.SelfAttention.Heads[i].dWv, Layer.SelfAttention.Heads[i].Wv);
        AccumulateHeadGrad(Layer.SelfAttention.Heads[i].dWo, Layer.SelfAttention.Heads[i].Wo);
      end;
    end;

    WriteLn('    SimpleBackwardLayer: завершен успешно');
    WriteLn('      gradInput: ', ShapeStr(gradInput));

  except
    on E: Exception do
    begin
      WriteLn('ОШИБКА в SimpleBackwardLayer: ', E.Message);
      WriteLn('  Тип ошибки: ', E.ClassName);
      // Fail fast while debugging
      Halt;
      // Not reached while Halt is in place; pass-through fallback if it is removed
      gradInput := CopyMatrix(gradOutput);
    end;
  end;
end;

// Backward pass over the whole model.
//
// Steps:
//   1. InitializeAllGradients(Model).
//   2. Halt with an error message if gradOutput or Input is empty.
//   3. Walk the layers from last to first, feeding the gradient through
//      SimpleBackwardLayer (the debugging stand-in for BackwardLayer).
//   4. Add Input^T * (gradient after the first layer) to Model.Embedding_Grad,
//      clipped to the overlapping region.
//
// Any exception in steps 3-4 is logged and the program is halted.
procedure BackwardTransformer(var Model: TTransformer; const Input: TDoubleMatrix; const gradOutput: TDoubleMatrix);
var
  i, j, k: Integer;
  currentGrad, layerGrad, embeddingGrad: TDoubleMatrix;

  // "RxC" for a matrix; "0x0" when it has no rows. A plain function is used
  // because IfThen evaluates both arguments, including m[0] on an empty matrix,
  // which would raise before the empty-input checks below could report anything.
  function ShapeStr(const m: TDoubleMatrix): string;
  begin
    if Length(m) > 0 then
      Result := IntToStr(Length(m)) + 'x' + IntToStr(Length(m[0]))
    else
      Result := '0x0';
  end;

begin
  WriteLn('BackwardTransformer: начат (полная версия)');

  // Make sure every gradient buffer exists before anything accumulates into it
  InitializeAllGradients(Model);

  WriteLn('  gradOutput: ', ShapeStr(gradOutput));
  WriteLn('  Input: ', ShapeStr(Input));

  // Input validation
  if (Length(gradOutput) = 0) or (Length(gradOutput[0]) = 0) then begin
    WriteLn('ОШИБКА: Пустой gradOutput в BackwardTransformer');
    Halt;
    Exit;
  end;

  if (Length(Input) = 0) or (Length(Input[0]) = 0) then begin
    WriteLn('ОШИБКА: Пустой Input в BackwardTransformer');
    Halt;
    Exit;
  end;

  try
    currentGrad := CopyMatrix(gradOutput);

    // Layers are visited in reverse order
    for i := High(Model.Layers) downto 0 do begin
      WriteLn('  Слой ', i, ':');
      WriteLn('    currentGrad: ', ShapeStr(currentGrad));

      SetLength(layerGrad, 0, 0);

      // Simplified version is used for now, for debugging
      SimpleBackwardLayer(Model.Layers[i], currentGrad, layerGrad);

      // Once the simplified version works, switch to:
      // BackwardLayer(Model.Layers[i], currentGrad, layerGrad);

      currentGrad := layerGrad;
      WriteLn('    currentGrad после слоя: ', ShapeStr(currentGrad));
    end;

    // Backward through the embedding
    WriteLn('  Embedding backward...');
    if (Length(currentGrad) > 0) and (Length(Input) > 0) then
    begin
      // dL/dEmbedding = Input^T * currentGrad
      embeddingGrad := MatrixMultiply(TransposeMatrix(Input), currentGrad);

      // Accumulate where both matrices have an element
      for j := 0 to High(Model.Embedding_Grad) do
      begin
        for k := 0 to High(Model.Embedding_Grad[j]) do
        begin
          if (j < Length(embeddingGrad)) and (k < Length(embeddingGrad[j])) then
            Model.Embedding_Grad[j][k] := Model.Embedding_Grad[j][k] + embeddingGrad[j][k];
        end;
      end;
    end;

    WriteLn('BackwardTransformer: завершен успешно');

  except
    on E: Exception do
    begin
      WriteLn('ОШИБКА в BackwardTransformer: ', E.Message);
      WriteLn('  Тип ошибки: ', E.ClassName);
      Halt;
    end;
  end;
end;

// Shrinks every element of Matrix by the factor (1 - LearningRate * WeightDecay).
// Rows may have different lengths; each row is walked over its own length.
procedure ApplyL2ToMatrix(var Matrix: TDoubleMatrix; WeightDecay, LearningRate: Double);
var
  row, col: Integer;
begin
  row := 0;
  while row <= High(Matrix) do
  begin
    // Elements are independent, so the traversal direction does not matter
    for col := High(Matrix[row]) downto 0 do
      Matrix[row][col] := Matrix[row][col] * (1 - LearningRate * WeightDecay);
    Inc(row);
  end;
end;

// Plain gradient-descent step: params := params - learningRate * grads.
// Only the region present in both matrices is touched; nothing happens when
// either matrix has no rows.
procedure UpdateMatrixSimple(var params, grads: TDoubleMatrix; learningRate: Double);
var
  row, col, lastRow, lastCol: Integer;
begin
  if (Length(params) > 0) and (Length(grads) > 0) then
  begin
    lastRow := Min(High(params), High(grads));
    for row := 0 to lastRow do
    begin
      // Column bound is taken per row, as rows may differ in length
      lastCol := Min(High(params[row]), High(grads[row]));
      col := 0;
      while col <= lastCol do
      begin
        params[row][col] := params[row][col] - learningRate * grads[row][col];
        Inc(col);
      end;
    end;
  end;
end;

// Multiplies the model's weight matrices by
// (1 - LearningRate * Model.Config.WeightDecay).
//
// Affected: the embedding matrix and, in every layer, Wq/Wk/Wv/Wo of each
// attention head plus both FFN weight matrices. LayerNorm parameters are
// not touched.
procedure ApplyWeightDecay(var Model: TTransformer; LearningRate: Double);
var
  layerIdx, headIdx: Integer;
  decayFactor: Double;

  // Scales every element of W by decayFactor.
  procedure Shrink(var W: TDoubleMatrix);
  var
    r, c: Integer;
  begin
    for r := 0 to High(W) do
      for c := 0 to High(W[r]) do
        W[r][c] := W[r][c] * decayFactor;
  end;

begin
  decayFactor := 1.0 - LearningRate * Model.Config.WeightDecay;

  // Embedding
  Shrink(Model.Embedding);

  for layerIdx := 0 to High(Model.Layers) do
  begin
    // Attention projections of every head
    for headIdx := 0 to High(Model.Layers[layerIdx].SelfAttention.Heads) do
    begin
      Shrink(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].Wq);
      Shrink(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].Wk);
      Shrink(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].Wv);
      Shrink(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].Wo);
    end;

    // Feed-forward weights
    Shrink(Model.Layers[layerIdx].FFN_weights1);
    Shrink(Model.Layers[layerIdx].FFN_weights2);
  end;
end;

// Guarded wrapper around UpdateMatrixWithAdam.
//
// Logs the shapes of params and grads, then:
//   * returns without updating when either matrix is empty;
//   * returns without updating when row counts or first-row column counts differ;
//   * initialises `state` via InitAdamState when its M or V is empty;
//   * calls UpdateMatrixWithAdam(params, grads, state, learningRate).
// An exception raised by the update is logged and the program is halted.
procedure SafeUpdateMatrixWithAdam(var params, grads: TDoubleMatrix; 
                                 var state: TAdamState; learningRate: Double);

  // "RxC" for a matrix; "0x0" when it has no rows. A plain function is used
  // because IfThen evaluates both arguments, including m[0] on an empty matrix,
  // which would raise before the "skip empty" guard below is reached.
  function ShapeStr(const m: TDoubleMatrix): string;
  begin
    if Length(m) > 0 then
      Result := IntToStr(Length(m)) + 'x' + IntToStr(Length(m[0]))
    else
      Result := '0x0';
  end;

begin
  WriteLn('        SafeUpdateMatrixWithAdam: начат');
  WriteLn('          params: ', ShapeStr(params));
  WriteLn('          grads: ', ShapeStr(grads));

  // Nothing to do for empty matrices
  if (Length(params) = 0) or (Length(grads) = 0) then
  begin
    WriteLn('          ⚠ Пропускаем - пустые матрицы');
    Exit;
  end;

  // Shapes must agree (both matrices have a first row at this point)
  if (Length(params) <> Length(grads)) or (Length(params[0]) <> Length(grads[0])) then
  begin
    WriteLn('          ⚠ Пропускаем - несовпадение размеров');
    WriteLn('            params: ', Length(params), 'x', Length(params[0]));
    WriteLn('            grads: ', Length(grads), 'x', Length(grads[0]));
    Exit;
  end;

  // Lazily create the Adam moment buffers
  if (Length(state.M) = 0) or (Length(state.V) = 0) then
  begin
    WriteLn('          Инициализируем состояния Adam...');
    InitAdamState(state, Length(params), Length(params[0]));
  end;

  try
    UpdateMatrixWithAdam(params, grads, state, learningRate);
    WriteLn('          ✓ Матрица успешно обновлена');
  except
    on E: Exception do
    begin
      WriteLn('          ⚠ Ошибка при обновлении: ', E.Message);
      WriteLn('          Тип ошибки: ', E.ClassName);
      Halt;
    end;
  end;
end;

// Guarded wrapper around UpdateVectorAdam for LayerNorm parameter vectors.
//
// Logs both lengths, then halts the program when either vector is empty or
// when the lengths differ. Otherwise it initialises `state` via
// InitAdamVectorState if its M or V is empty and calls
// UpdateVectorAdam(params, grads, state, learningRate). An exception raised
// by the update is logged and the program is halted.
procedure SafeUpdateLayerNormParams(var params, grads: TDoubleArray; 
                                  var state: TAdamVectorState; learningRate: Double);
var
  paramCount, gradCount: Integer;
begin
  paramCount := Length(params);
  gradCount := Length(grads);

  WriteLn('        SafeUpdateLayerNormParams: начат');
  WriteLn('          params length: ', paramCount);
  WriteLn('          grads length: ', gradCount);

  // Both vectors must contain data
  if not ((paramCount > 0) and (gradCount > 0)) then
  begin
    WriteLn('          ⚠ Пропускаем - пустые массивы');
    Halt;
    Exit;
  end;

  // ... and be of equal length
  if not (paramCount = gradCount) then
  begin
    WriteLn('          ⚠ Пропускаем - несовпадение размеров');
    WriteLn('            params: ', paramCount);
    WriteLn('            grads: ', gradCount);
    Halt;
    Exit;
  end;

  // Lazily create the Adam moment buffers
  if not ((Length(state.M) > 0) and (Length(state.V) > 0)) then
  begin
    WriteLn('          Инициализируем состояния Adam...');
    InitAdamVectorState(state, paramCount);
  end;

  try
    UpdateVectorAdam(params, grads, state, learningRate);
    WriteLn('          ✓ LayerNorm параметры успешно обновлены');
  except
    on E: Exception do
    begin
      WriteLn('          ⚠ Ошибка при обновлении LayerNorm: ', E.Message);
      WriteLn('          Тип ошибки: ', E.ClassName);
      Halt;
    end;
  end;
end;

// Applies one optimisation step to the whole model.
//
// Steps:
//   0. Log the shapes of the embedding, FFN and dWq gradients.
//   1. Embedding: plain step  E := E - learningRate * Embedding_Grad
//      (only where a gradient element exists).
//   2. Per layer: Wq/Wk/Wv/Wo of every head and both FFN matrices through
//      SafeUpdateMatrixWithAdam; the four LayerNorm vectors through
//      SafeUpdateLayerNormParams. A parameter whose gradient is empty or of a
//      different shape is skipped (with a log line where the messages below exist).
//   3. ApplyWeightDecay when model.Config.WeightDecay > 0.
//   4. ResetGradients(model).
//
// Any exception is logged and the program is halted.
procedure UpdateTransformer(var model: TTransformer; learningRate: Double);
var
  i, j: Integer;

  // "RxC" for a matrix; "0x0" when it has no rows. A plain function is used
  // because IfThen evaluates both arguments, including m[0] on an empty matrix,
  // which would abort the update instead of letting the guards below skip it.
  function ShapeStr(const m: TDoubleMatrix): string;
  begin
    if Length(m) > 0 then
      Result := IntToStr(Length(m)) + 'x' + IntToStr(Length(m[0]))
    else
      Result := '0x0';
  end;

  // Adam step for one attention matrix. Returns False (and does nothing) when
  // either matrix is empty or their row / first-row column counts differ.
  function TryUpdateHeadMatrix(var weights, grads: TDoubleMatrix; var state: TAdamState): Boolean;
  begin
    Result := (Length(weights) > 0) and
              (Length(grads) > 0) and
              (Length(weights) = Length(grads)) and
              (Length(weights[0]) = Length(grads[0]));
    if Result then
      SafeUpdateMatrixWithAdam(weights, grads, state, learningRate);
  end;

  // Adam step for one FFN matrix, with shape diagnostics. `tag` is the name
  // used in the log lines ('FFN1' / 'FFN2').
  procedure UpdateFFN(var weights, grads: TDoubleMatrix; var state: TAdamState; const tag: string);
  begin
    if (Length(weights) > 0) and (Length(grads) > 0) then
    begin
      WriteLn('        ', tag, ' размеры:');
      WriteLn('          weights: ', Length(weights), 'x', Length(weights[0]));
      WriteLn('          grads: ', Length(grads), 'x', Length(grads[0]));

      if (Length(weights) = Length(grads)) and
         (Length(weights[0]) = Length(grads[0])) then
      begin
        SafeUpdateMatrixWithAdam(weights, grads, state, learningRate);
        WriteLn('        ✓ ', tag, ' обновлены');
      end
      else
      begin
        WriteLn('        ⚠ Пропускаем ', tag, ' - несовпадение размеров');
        WriteLn('          weights rows: ', Length(weights), 
                ' vs grads rows: ', Length(grads));
        WriteLn('          weights cols: ', Length(weights[0]), 
                ' vs grads cols: ', Length(grads[0]));
      end;
    end
    else
      WriteLn('        ⚠ Пропускаем ', tag, ' - пустые данные');
  end;

  // Adam step for one LayerNorm vector; skipped with a log line when the
  // parameters or their gradient are empty.
  procedure UpdateNorm(var params, grads: TDoubleArray; var state: TAdamVectorState; const tag: string);
  begin
    if (Length(params) > 0) and (Length(grads) > 0) then
      SafeUpdateLayerNormParams(params, grads, state, learningRate)
    else
      WriteLn('        ⚠ Пропускаем ', tag, ' - пустые данные');
  end;

begin
  WriteLn('UpdateTransformer: начат');
  WriteLn('  learningRate: ', learningRate:0:6);

  try
    // 0. Diagnostics: gradient shapes before the update
    WriteLn('  Диагностика градиентов перед обновлением:');
    WriteLn('    Embedding_Grad: ', ShapeStr(model.Embedding_Grad));

    for i := 0 to High(model.Layers) do
    begin
      WriteLn('    Слой ', i, ':');
      WriteLn('      FFN1_Grad: ', ShapeStr(model.Layers[i].FFN1_Grad));
      WriteLn('      FFN2_Grad: ', ShapeStr(model.Layers[i].FFN2_Grad));

      for j := 0 to High(model.Layers[i].SelfAttention.Heads) do
      begin
        WriteLn('      Голова ', j, ':');
        WriteLn('        dWq: ', ShapeStr(model.Layers[i].SelfAttention.Heads[j].dWq));
      end;
    end;

    // 1. Embedding update (plain gradient step)
    WriteLn('  Обновление эмбеддингов...');

    for i := 0 to High(model.Embedding) do 
    begin
      for j := 0 to High(model.Embedding[i]) do 
      begin
        // Only where a gradient element exists
        if (i < Length(model.Embedding_Grad)) and 
           (j < Length(model.Embedding_Grad[i])) then 
        begin
          model.Embedding[i][j] := model.Embedding[i][j] - learningRate * model.Embedding_Grad[i][j];
        end;
      end;
    end;

    // 2. Layer updates
    WriteLn('  Обновление слоев...');
    for i := 0 to High(model.Layers) do 
    begin
      WriteLn('    Слой ', i, ':');

      // Attention weights
      WriteLn('      Attention heads...');
      for j := 0 to High(model.Layers[i].SelfAttention.Heads) do 
      begin
        WriteLn('        Голова ', j, ':');

        // Wq is the only projection whose skip is reported
        if not TryUpdateHeadMatrix(model.Layers[i].SelfAttention.Heads[j].Wq,
                                   model.Layers[i].SelfAttention.Heads[j].dWq,
                                   model.Layers[i].SelfAttention.Heads[j].Wq_AdamState) then
          WriteLn('        Предупреждение: Пропускаем Wq - несовпадение размеров');

        TryUpdateHeadMatrix(model.Layers[i].SelfAttention.Heads[j].Wk,
                            model.Layers[i].SelfAttention.Heads[j].dWk,
                            model.Layers[i].SelfAttention.Heads[j].Wk_AdamState);

        TryUpdateHeadMatrix(model.Layers[i].SelfAttention.Heads[j].Wv,
                            model.Layers[i].SelfAttention.Heads[j].dWv,
                            model.Layers[i].SelfAttention.Heads[j].Wv_AdamState);

        TryUpdateHeadMatrix(model.Layers[i].SelfAttention.Heads[j].Wo,
                            model.Layers[i].SelfAttention.Heads[j].dWo,
                            model.Layers[i].SelfAttention.Heads[j].Wo_AdamState);
      end;

      // Feed-forward weights
      WriteLn('      FFN weights...');
      UpdateFFN(model.Layers[i].FFN_weights1,
                model.Layers[i].FFN1_Grad,
                model.Layers[i].FFN1_AdamState,
                'FFN1');
      UpdateFFN(model.Layers[i].FFN_weights2,
                model.Layers[i].FFN2_Grad,
                model.Layers[i].FFN2_AdamState,
                'FFN2');

      // LayerNorm parameters
      WriteLn('      LayerNorm parameters...');
      UpdateNorm(model.Layers[i].Norm1_Gamma,
                 model.Layers[i].Norm1_Gamma_Grad,
                 model.Layers[i].Norm1_Gamma_AdamState,
                 'Norm1_Gamma');
      UpdateNorm(model.Layers[i].Norm1_Beta,
                 model.Layers[i].Norm1_Beta_Grad,
                 model.Layers[i].Norm1_Beta_AdamState,
                 'Norm1_Beta');
      UpdateNorm(model.Layers[i].Norm2_Gamma,
                 model.Layers[i].Norm2_Gamma_Grad,
                 model.Layers[i].Norm2_Gamma_AdamState,
                 'Norm2_Gamma');
      UpdateNorm(model.Layers[i].Norm2_Beta,
                 model.Layers[i].Norm2_Beta_Grad,
                 model.Layers[i].Norm2_Beta_AdamState,
                 'Norm2_Beta');
    end;

    // 3. Weight decay
    if model.Config.WeightDecay > 0 then begin
      WriteLn('  Применение weight decay...');
      ApplyWeightDecay(model, learningRate);
    end;

    // 4. Clear the gradients for the next step
    WriteLn('  Сброс градиентов...');
    ResetGradients(model);

    WriteLn('UpdateTransformer: завершен');

  except
    on E: Exception do 
    begin
      WriteLn('ОШИБКА в UpdateTransformer: ', E.Message);
      WriteLn('  Тип ошибки: ', E.ClassName);
      WriteLn('Продолжаем без обновления весов');
      // NOTE(review): the message above says "continuing", but the program halts here
      Halt;
    end;
  end;
end;

// Prints the dimensions of every parameter of Model: Config.InputSize, the
// embedding matrix and, per layer, both FFN matrices, the four LayerNorm
// vectors and Wq/Wk/Wv/Wo of each attention head.
//
// A matrix with no rows is reported as 0x0 rather than raising a range error,
// so the routine stays usable on a partly initialised model.
procedure CheckModelDimensions(const Model: TTransformer);
var
  i, j: Integer;

  // "RxC" for a matrix; "0x0" when it has no rows.
  function ShapeStr(const m: TDoubleMatrix): string;
  begin
    if Length(m) > 0 then
      Result := IntToStr(Length(m)) + 'x' + IntToStr(Length(m[0]))
    else
      Result := '0x0';
  end;

begin
  WriteLn('=== ПРОВЕРКА РАЗМЕРНОСТЕЙ МОДЕЛИ ===');
  WriteLn('Config.InputSize: ', Model.Config.InputSize);
  WriteLn('Embedding: ', ShapeStr(Model.Embedding));

  for i := 0 to High(Model.Layers) do begin
    WriteLn('Слой ', i, ':');

    // FFN weights
    WriteLn('  FFN1: ', ShapeStr(Model.Layers[i].FFN_weights1));
    WriteLn('  FFN2: ', ShapeStr(Model.Layers[i].FFN_weights2));

    // LayerNorm parameters
    WriteLn('  Norm1_Gamma: ', Length(Model.Layers[i].Norm1_Gamma));
    WriteLn('  Norm1_Beta: ', Length(Model.Layers[i].Norm1_Beta));
    WriteLn('  Norm2_Gamma: ', Length(Model.Layers[i].Norm2_Gamma));
    WriteLn('  Norm2_Beta: ', Length(Model.Layers[i].Norm2_Beta));

    // Attention heads
    for j := 0 to High(Model.Layers[i].SelfAttention.Heads) do begin
      WriteLn('  Head ', j, ':');
      WriteLn('    Wq: ', ShapeStr(Model.Layers[i].SelfAttention.Heads[j].Wq));
      WriteLn('    Wk: ', ShapeStr(Model.Layers[i].SelfAttention.Heads[j].Wk));
      WriteLn('    Wv: ', ShapeStr(Model.Layers[i].SelfAttention.Heads[j].Wv));
      WriteLn('    Wo: ', ShapeStr(Model.Layers[i].SelfAttention.Heads[j].Wo));
    end;
  end;
end;

// Dot product of two vectors over their common prefix: entries beyond the
// end of the shorter vector are ignored. Returns 0.0 when either is empty.
function SumMultiply(const a, b: TDoubleArray): Double;
var
  idx, lastIdx: Integer;
  acc: Double;
begin
  // High() of an empty dynamic array is -1, so the loop below is skipped.
  if High(a) < High(b) then
    lastIdx := High(a)
  else
    lastIdx := High(b);

  acc := 0.0;
  idx := 0;
  while idx <= lastIdx do begin
    acc := acc + a[idx] * b[idx];
    Inc(idx);
  end;

  Result := acc;
end;

// Simplified LayerNorm backward pass: multiplies every gradient column by the
// matching gamma entry and ignores the mean/variance terms. Columns that have
// no gamma entry (j >= Length(gamma)) are passed through unchanged.
//
// The result has the row count of gradOutput and the column count of its
// first row. An empty gradOutput yields an empty result; without the guard
// the size computation would index row 0 of an empty array.
function SimpleLayerNormBackward(const gradOutput: TDoubleMatrix; const gamma: TDoubleArray): TDoubleMatrix;
var
  i, j: Integer;
begin
  if Length(gradOutput) = 0 then begin
    SetLength(Result, 0, 0);
    Exit;
  end;

  SetLength(Result, Length(gradOutput), Length(gradOutput[0]));
  for i := 0 to High(gradOutput) do
    for j := 0 to High(gradOutput[0]) do
      if j < Length(gamma) then
        Result[i][j] := gradOutput[i][j] * gamma[j]
      else
        Result[i][j] := gradOutput[i][j];
end;

// Backward pass of layer normalisation, computed independently for each row
// of `input`.
//
// For a row x with mean m, variance v and s = Sqrt(v + 1e-8):
//   xhat_j    = (x_j - m) / s
//   gradGamma = sum over rows of gradOutput * xhat
//   gradBeta  = sum over rows of gradOutput
//   g_j       = gradOutput_j * gamma_j
//   dVar      = sum_j g_j * (x_j - m) * (-0.5) * (v + 1e-8)^(-1.5)
//   dMean     = sum_j (-g_j / s)
//   dX_j      = g_j / s + dVar * 2 * (x_j - m) / size + dMean / size
// dVar and dMean are per-row scalars: every element of the row feeds the same
// mean and variance, so their contributions are summed over the row. (The
// variance term of dMean is proportional to sum_j (x_j - m), which is zero,
// and is therefore omitted.)
//
// Parameters:
//   gradOutput - gradient with respect to the LayerNorm output; indexed with
//                the row/column indices of `input`
//   normOutput - only checked for emptiness and logged; not used in the maths
//   input      - the values that were normalised; fixes the result shape
//   gamma      - scale vector; read for every column of `input`
//   gradGamma, gradBeta - (out) gradients, one entry per column of `input`
// Returns the gradient with respect to `input`.
//
// If any of the three matrices is empty the error is logged and the program
// is terminated with Halt.
function LayerNormBackward(const gradOutput, normOutput, input: TDoubleMatrix;
                         const gamma: TDoubleArray;
                         out gradGamma, gradBeta: TDoubleArray): TDoubleMatrix;
var
  i, j, size: Integer;
  mean, variance, stddev, centered: Double;
  dVar, dMean: Double;
  dNorm: TDoubleArray;
begin
  WriteLn('    LayerNormBackward: начат');

  // Validate before touching row 0 of any matrix: the shape logging below
  // indexes [0], which fails the range check on an empty matrix.
  if (Length(gradOutput) = 0) or (Length(normOutput) = 0) or (Length(input) = 0) then begin
    WriteLn('ОШИБКА: Пустые входы в LayerNormBackward');
    Halt;
    // Not reached while the Halt above is in place; kept as the non-fatal
    // fallback (empty result).
    SetLength(Result, 0, 0);
    Exit;
  end;

  WriteLn('      gradOutput: ', Length(gradOutput), 'x', Length(gradOutput[0]));
  WriteLn('      normOutput: ', Length(normOutput), 'x', Length(normOutput[0]));
  WriteLn('      input: ', Length(input), 'x', Length(input[0]));
  WriteLn('      gamma: ', Length(gamma));

  size := Length(input[0]);
  SetLength(Result, Length(input), size);
  SetLength(gradGamma, size);
  SetLength(gradBeta, size);
  SetLength(dNorm, size);

  FillArray(gradGamma, 0.0);
  FillArray(gradBeta, 0.0);

  for i := 0 to High(input) do begin
    // Row statistics
    mean := 0.0;
    for j := 0 to size - 1 do
      mean := mean + input[i][j];
    mean := mean / size;

    variance := 0.0;
    for j := 0 to size - 1 do
      variance := variance + Sqr(input[i][j] - mean);
    variance := variance / size;
    stddev := Sqrt(variance + 1e-8);

    // Parameter gradients, plus the row-wide sums needed for dVar and dMean
    dVar := 0.0;
    dMean := 0.0;
    for j := 0 to size - 1 do begin
      centered := input[i][j] - mean;

      gradGamma[j] := gradGamma[j] + gradOutput[i][j] * (centered / stddev);
      gradBeta[j] := gradBeta[j] + gradOutput[i][j];

      // Gradient with respect to the normalised value
      dNorm[j] := gradOutput[i][j] * gamma[j];

      dVar := dVar + dNorm[j] * centered;
      dMean := dMean - dNorm[j] / stddev;
    end;
    dVar := dVar * (-0.5) * Power(variance + 1e-8, -1.5);

    // Gradient with respect to the input row
    for j := 0 to size - 1 do
      Result[i][j] := dNorm[j] / stddev
                    + dVar * 2.0 * (input[i][j] - mean) / size
                    + dMean / size;
  end;

  WriteLn('    LayerNormBackward: завершен');
end;

// Element-wise derivative of ReLU: 1.0 where the entry is strictly positive,
// 0.0 otherwise (including at exactly 0).
//
// The result has the row count of m and the column count of its first row.
// An empty m yields an empty result; without the guard the size computation
// would index row 0 of an empty array.
function ReLUDerivative(const m: TDoubleMatrix): TDoubleMatrix;
var
  i, j: Integer;
begin
  if Length(m) = 0 then begin
    SetLength(Result, 0, 0);
    Exit;
  end;

  SetLength(Result, Length(m), Length(m[0]));
  for i := 0 to High(m) do
    for j := 0 to High(m[0]) do
      if m[i][j] > 0 then
        Result[i][j] := 1.0
      else
        Result[i][j] := 0.0;
end;

// Backward pass through the feed-forward block of one layer.
//
// Steps, in order:
//   1. adds cachedFFNOutput^T * gradOutput to Layer.FFN_weights2_Grad (skipped
//      when cachedFFNOutput is empty) and propagates gradOutput through
//      FFN_weights2^T to obtain the hidden gradient;
//   2. zeroes the hidden gradient wherever the ReLU mask entry is <= 0;
//   3. adds ffnInput^T * gradHidden to Layer.FFN_weights1_Grad and propagates
//      the hidden gradient through FFN_weights1^T.
//
// Returns the gradient with respect to ffnInput.
//
// An empty gradOutput or ffnInput, and any exception raised while computing,
// is logged and then terminates the program with Halt.
//
// NOTE(review): gradients are accumulated into FFN_weights1_Grad /
// FFN_weights2_Grad, while the clipping and reset routines in this unit work
// on FFN1_Grad / FFN2_Grad - confirm which pair the optimizer consumes.
function FFNBackward(var Layer: TTransformerLayer; const gradOutput: TDoubleMatrix; const ffnInput: TDoubleMatrix): TDoubleMatrix;
var
  gradHidden, gradInput, gradW1, gradW2: TDoubleMatrix;
  i, j: Integer;
begin
  WriteLn('      FFNBackward: начат');

  try
    // Validate before logging shapes: the shape lines index row 0, which
    // fails the range check on an empty matrix.
    if (Length(gradOutput) = 0) or (Length(ffnInput) = 0) then begin
      WriteLn('ОШИБКА: Пустые входы в FFNBackward');
      Halt;
      // Not reached while the Halt above is in place; kept as the non-fatal
      // fallback (empty result).
      SetLength(Result, 0, 0);
      Exit;
    end;

    WriteLn('        gradOutput: ', Length(gradOutput), 'x', Length(gradOutput[0]));
    WriteLn('        ffnInput: ', Length(ffnInput), 'x', Length(ffnInput[0]));

    // 1. Backward through the second linear layer (FFN_weights2)
    WriteLn('        Backward через FFN_weights2...');

    // Weight gradient: dL/dW2 = hidden^T * gradOutput
    if (Length(Layer.cachedFFNOutput) > 0) then begin
      gradW2 := MatrixMultiply(TransposeMatrix(Layer.cachedFFNOutput), gradOutput);
      Layer.FFN_weights2_Grad := AddMatrices(Layer.FFN_weights2_Grad, gradW2);
    end;

    // Hidden gradient: dL/dhidden = gradOutput * W2^T
    if (Length(Layer.FFN_weights2) > 0) then begin
      gradHidden := MatrixMultiply(gradOutput, TransposeMatrix(Layer.FFN_weights2));
    end else begin
      WriteLn('        Предупреждение: FFN_weights2 пустые');
      gradHidden := CopyMatrix(gradOutput);
    end;

    // 2. Backward through ReLU: the gradient is dropped where the mask is <= 0
    WriteLn('        Backward через ReLU...');
    // NOTE(review): the mask is read from cachedNorm1Output using gradHidden's
    // indices. gradHidden presumably has one column per row of FFN_weights2,
    // so this only lines up if cachedNorm1Output is at least that wide -
    // confirm which cached tensor holds the hidden pre-activation.
    if (Length(Layer.cachedNorm1Output) > 0) then begin
      for i := 0 to High(gradHidden) do begin
        for j := 0 to High(gradHidden[i]) do begin
          if Layer.cachedNorm1Output[i][j] <= 0 then
            gradHidden[i][j] := 0.0;
        end;
      end;
    end;

    // 3. Backward through the first linear layer (FFN_weights1)
    WriteLn('        Backward через FFN_weights1...');

    // Weight gradient: dL/dW1 = input^T * gradHidden
    gradW1 := MatrixMultiply(TransposeMatrix(ffnInput), gradHidden);
    Layer.FFN_weights1_Grad := AddMatrices(Layer.FFN_weights1_Grad, gradW1);

    // Input gradient: dL/dinput = gradHidden * W1^T
    if (Length(Layer.FFN_weights1) > 0) then begin
      gradInput := MatrixMultiply(gradHidden, TransposeMatrix(Layer.FFN_weights1));
    end else begin
      WriteLn('        Предупреждение: FFN_weights1 пустые');
      gradInput := CopyMatrix(gradHidden);
    end;

    Result := gradInput;
    WriteLn('      FFNBackward: завершен');

  except
    on E: Exception do begin
      WriteLn('ОШИБКА в FFNBackward: ', E.Message);
      Halt;
      // Not reached while the Halt above is in place.
      Result := CopyMatrix(gradOutput);
    end;
  end;
end;

// Accumulates the embedding gradient for one batch into model.Embedding_Grad.
//
// The batch contribution is gradOutput^T * input; it is added element by
// element over the index range of Embedding_Grad (column range taken from
// its first row). Nothing is added when Embedding_Grad has no rows.
//
// NOTE(review): the original author's comment assumes `input` holds one-hot
// rows; the product is presumably [cols(gradOutput) x cols(input)] - confirm
// that this orientation matches how Embedding is applied in the forward pass.
procedure EmbeddingBackward(var model: TTransformer; const gradOutput: TDoubleMatrix; const input: TDoubleMatrix);
var
  delta: TDoubleMatrix;
  row, col: Integer;
begin
  delta := MatrixMultiply(TransposeMatrix(gradOutput), input);

  for row := 0 to High(model.Embedding_Grad) do begin
    col := 0;
    while col <= High(model.Embedding_Grad[0]) do begin
      model.Embedding_Grad[row][col] := model.Embedding_Grad[row][col] + delta[row][col];
      Inc(col);
    end;
  end;
end;

// Prints the dimensions of the Adam moment buffers (M and V) to stdout: the
// embedding state, then per layer the FFN1, FFN2 and Norm1_Gamma states.
// Diagnostic only; the model is not modified.
//
// Matrix shapes go through a guarded helper, so a state that has not been
// allocated is reported as "0x0" instead of indexing row 0 of an empty array
// (a range-check failure under this unit's {$RANGECHECKS ON}).
procedure CheckAdamStates(const Model: TTransformer);
var
  i: Integer;

  // Returns 'ROWSxCOLS'; COLS is the length of the first row, 0 if no rows.
  function ShapeText(const M: TDoubleMatrix): string;
  begin
    if Length(M) > 0 then
      Result := IntToStr(Length(M)) + 'x' + IntToStr(Length(M[0]))
    else
      Result := '0x0';
  end;

begin
  WriteLn('=== ПРОВЕРКА СОСТОЯНИЙ ADAM ===');
  WriteLn('Embedding_AdamState:');
  WriteLn('  M: ', ShapeText(Model.Embedding_AdamState.M));
  WriteLn('  V: ', ShapeText(Model.Embedding_AdamState.V));
  for i := 0 to High(Model.Layers) do begin
    WriteLn('Слой ', i, ':');
    WriteLn('  FFN1_AdamState:');
    WriteLn('    M: ', ShapeText(Model.Layers[i].FFN1_AdamState.M));
    WriteLn('    V: ', ShapeText(Model.Layers[i].FFN1_AdamState.V));
    WriteLn('  FFN2_AdamState:');
    WriteLn('    M: ', ShapeText(Model.Layers[i].FFN2_AdamState.M));
    WriteLn('    V: ', ShapeText(Model.Layers[i].FFN2_AdamState.V));
    WriteLn('  Norm1_Gamma_AdamState:');
    WriteLn('    M: ', Length(Model.Layers[i].Norm1_Gamma_AdamState.M));
    WriteLn('    V: ', Length(Model.Layers[i].Norm1_Gamma_AdamState.V));
    // The remaining per-layer Adam states are not reported here.
  end;
end;

// Produces the pre-normalised copy of `input` in `normalized`.
//
// LayerNorm with model.PreNormGamma / model.PreNormBeta is applied only when
// Config.UseLayerNorm is set and PreNormGamma is non-empty; in every other
// case `normalized` is a plain copy of `input`.
procedure ApplyPreLayerNorm(var model: TTransformer; const input: TDoubleMatrix; out normalized: TDoubleMatrix);
var
  normEnabled: Boolean;
begin
  normEnabled := model.Config.UseLayerNorm and (Length(model.PreNormGamma) > 0);

  if not normEnabled then begin
    // Pass-through path: hand back an unmodified copy.
    normalized := CopyMatrix(input);
    Exit;
  end;

  normalized := LayerNorm(input, model.PreNormGamma, model.PreNormBeta);
end;

// Global-norm gradient clipping over the model's gradient tensors.
//
// The L2 norm is taken over Embedding_Grad and, for every layer, the
// dWq/dWk/dWv/dWo of each attention head, FFN1_Grad, FFN2_Grad and the four
// LayerNorm gradient vectors. If that norm is greater than maxNorm, all of
// those tensors are multiplied by maxNorm / (norm + 1e-8) and one log line is
// written; otherwise nothing is changed.
//
// Both passes use the same traversal routine so that the set and order of
// tensors measured is exactly the set and order rescaled.
procedure ApplyGradientClippingToModel(var Model: TTransformer; maxNorm: Double);
var
  sumSquares, gradNorm, factor: Double;
  rescaling: Boolean; // False: accumulate squares; True: multiply by factor

  // Measures or rescales one matrix, depending on `rescaling`.
  procedure TouchMatrix(var M: TDoubleMatrix);
  var
    r, c: Integer;
  begin
    for r := 0 to High(M) do
      for c := 0 to High(M[r]) do
        if rescaling then
          M[r][c] := M[r][c] * factor
        else
          sumSquares := sumSquares + Sqr(M[r][c]);
  end;

  // Measures or rescales one vector, depending on `rescaling`.
  procedure TouchVector(var V: TDoubleArray);
  var
    k: Integer;
  begin
    for k := 0 to High(V) do
      if rescaling then
        V[k] := V[k] * factor
      else
        sumSquares := sumSquares + Sqr(V[k]);
  end;

  // Visits every gradient tensor in a fixed order.
  procedure WalkGradients;
  var
    layerIdx, headIdx: Integer;
  begin
    // Embedding gradient
    TouchMatrix(Model.Embedding_Grad);

    for layerIdx := 0 to High(Model.Layers) do begin
      // Attention gradients
      for headIdx := 0 to High(Model.Layers[layerIdx].SelfAttention.Heads) do begin
        TouchMatrix(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].dWq);
        TouchMatrix(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].dWk);
        TouchMatrix(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].dWv);
        TouchMatrix(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].dWo);
      end;

      // FFN gradients
      TouchMatrix(Model.Layers[layerIdx].FFN1_Grad);
      TouchMatrix(Model.Layers[layerIdx].FFN2_Grad);

      // LayerNorm gradients
      TouchVector(Model.Layers[layerIdx].Norm1_Gamma_Grad);
      TouchVector(Model.Layers[layerIdx].Norm1_Beta_Grad);
      TouchVector(Model.Layers[layerIdx].Norm2_Gamma_Grad);
      TouchVector(Model.Layers[layerIdx].Norm2_Beta_Grad);
    end;
  end;

begin
  // Pass 1: sum of squares of every gradient entry
  sumSquares := 0.0;
  factor := 1.0;
  rescaling := False;
  WalkGradients;

  gradNorm := Sqrt(sumSquares);

  // Pass 2: rescale only when the norm exceeds the limit
  if gradNorm > maxNorm then begin
    factor := maxNorm / (gradNorm + 1e-8);
    WriteLn('Gradient clipping: ', gradNorm:0:4, ' -> ', maxNorm:0:4);

    rescaling := True;
    WalkGradients;
  end;
end;

// Zeroes the accumulated gradients of the model: Embedding_Grad and, for each
// layer, FFN1_Grad, FFN2_Grad, the four LayerNorm gradient vectors and the
// dWq/dWk/dWv/dWo of every attention head. Buffers that are empty are left
// untouched (they are not allocated here).
procedure ResetGradients(var Model: TTransformer);
var
  layerIdx, headIdx: Integer;

  // Fills a non-empty matrix with zeros.
  procedure ClearMatrixGrad(var M: TDoubleMatrix);
  begin
    if Length(M) > 0 then
      FillMatrix(M, 0.0);
  end;

  // Fills a non-empty vector with zeros.
  procedure ClearVectorGrad(var V: TDoubleArray);
  begin
    if Length(V) > 0 then
      FillArray(V, 0.0);
  end;

begin
  WriteLn('ResetGradients: сброс всех градиентов');

  // Embedding gradient
  ClearMatrixGrad(Model.Embedding_Grad);

  for layerIdx := 0 to High(Model.Layers) do begin
    // FFN gradients
    ClearMatrixGrad(Model.Layers[layerIdx].FFN1_Grad);
    ClearMatrixGrad(Model.Layers[layerIdx].FFN2_Grad);

    // LayerNorm gradients
    ClearVectorGrad(Model.Layers[layerIdx].Norm1_Gamma_Grad);
    ClearVectorGrad(Model.Layers[layerIdx].Norm1_Beta_Grad);
    ClearVectorGrad(Model.Layers[layerIdx].Norm2_Gamma_Grad);
    ClearVectorGrad(Model.Layers[layerIdx].Norm2_Beta_Grad);

    // Attention gradients
    for headIdx := 0 to High(Model.Layers[layerIdx].SelfAttention.Heads) do begin
      ClearMatrixGrad(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].dWq);
      ClearMatrixGrad(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].dWk);
      ClearMatrixGrad(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].dWv);
      ClearMatrixGrad(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].dWo);
    end;
  end;
end;

// Logs the model configuration and basic tensor shapes, repairing a few
// invalid values in place:
//   * Config.MaxSeqLength <= 0 is replaced by 100;
//   * Config.InputSize <= 0 is replaced by 300;
//   * an empty embedding matrix is re-created as InputSize x InputSize and
//     filled with Random * 0.02 - 0.01.
//
// Column counts are read through a guarded helper. The previous form,
// IfThen(Length(M) > 0, IntToStr(Length(M[0])), '0'), evaluated M[0] before
// IfThen was called, so an empty matrix failed the range check on the log
// line and the repair branch below it was never reached.
procedure ValidateModel(var model: TTransformer);
var
  i, j: Integer;

  // Length of the first row, or 0 when the matrix has no rows.
  function ColCount(const M: TDoubleMatrix): Integer;
  begin
    if Length(M) > 0 then
      Result := Length(M[0])
    else
      Result := 0;
  end;

begin
  WriteLn('=== ПРОВЕРКА МОДЕЛИ ===');

  // Configuration
  with model.Config do begin
    WriteLn('Конфигурация:');
    WriteLn('  InputSize: ', InputSize);
    WriteLn('  NumLayers: ', NumLayers);
    WriteLn('  NumHeads: ', NumHeads);
    WriteLn('  FFNDim: ', FFNDim);
    WriteLn('  MaxSeqLength: ', MaxSeqLength);
    WriteLn('  DropoutRate: ', DropoutRate:0:2);

    // Repair invalid values
    if MaxSeqLength <= 0 then begin
      WriteLn('  ИСПРАВЛЯЕМ: MaxSeqLength <= 0, устанавливаем 100');
      MaxSeqLength := 100;
    end;

    if InputSize <= 0 then begin
      WriteLn('  ИСПРАВЛЯЕМ: InputSize <= 0, устанавливаем 300');
      InputSize := 300;
    end;
  end;

  // Embedding matrix
  WriteLn('Embedding матрица: ', Length(model.Embedding), 'x', ColCount(model.Embedding));

  if (Length(model.Embedding) = 0) or (Length(model.Embedding[0]) = 0) then begin
    WriteLn('  ИСПРАВЛЯЕМ: Переинициализируем embedding матрицу');
    SetLength(model.Embedding, model.Config.InputSize, model.Config.InputSize);
    for i := 0 to High(model.Embedding) do
      for j := 0 to High(model.Embedding[i]) do
        model.Embedding[i][j] := Random * 0.02 - 0.01;
  end;

  // Layers (reported only, not repaired)
  WriteLn('Количество слоев: ', Length(model.Layers));
  for i := 0 to High(model.Layers) do begin
    WriteLn('  Слой ', i, ':');
    WriteLn('    FFN1: ', Length(model.Layers[i].FFN_weights1), 'x', ColCount(model.Layers[i].FFN_weights1));
  end;

  WriteLn('=== ПРОВЕРКА ЗАВЕРШЕНА ===');
end;

// In-place layer normalisation of every row of X:
//   X[i][j] := (X[i][j] - mean_i) / Sqrt(var_i + 1e-8) * Gamma[j] + Beta[j]
// where mean_i and var_i are the mean and (population) variance of row i.
//
// The row width n is taken from the first row; Gamma and Beta are read for
// every column index below n.
//
// X is left untouched when it has no rows or its first row is empty. Without
// the guard, the former case indexes row 0 of an empty array and the latter
// divides by n = 0.
procedure FastLayerNorm(var X: TDoubleMatrix; const Gamma, Beta: TDoubleArray);
var
  i, j, n: Integer;
  mean, variance, inv_std, val: Double;
begin
  if (Length(X) = 0) or (Length(X[0]) = 0) then
    Exit;

  n := Length(X[0]);
  for i := 0 to High(X) do begin
    // Row mean
    mean := 0.0;
    for j := 0 to n - 1 do
      mean := mean + X[i][j];
    mean := mean / n;

    // Row variance (sum of squared deviations, divided by n below)
    variance := 0.0;
    for j := 0 to n - 1 do begin
      val := X[i][j] - mean;
      variance := variance + val * val;
    end;
    inv_std := 1.0 / Sqrt(variance / n + 1e-8);

    // Normalise, then scale and shift
    for j := 0 to n - 1 do X[i][j] := (X[i][j] - mean) * inv_std * Gamma[j] + Beta[j];
  end;
end;

// Builds a sinusoidal positional-encoding table of size
// maxSeqLength x embeddingSize.
//
// For position p and column d, with e = d rounded down to an even number:
//   even d: Sin(p / 10000^(e / embeddingSize))
//   odd  d: Cos(p / 10000^(e / embeddingSize))
//
// When either argument is <= 0 an error line is written and an empty matrix
// is returned.
function CreatePositionalEncoding(maxSeqLength, embeddingSize: Integer): TDoubleMatrix;
var
  pos, dim, evenDim: Integer;
  phase: Double;
begin
  // Reject invalid sizes up front
  if (maxSeqLength <= 0) or (embeddingSize <= 0) then begin
    WriteLn('ОШИБКА: CreatePositionalEncoding с невалидными параметрами: ', maxSeqLength, 'x', embeddingSize);
    SetLength(Result, 0, 0);
    Exit;
  end;

  SetLength(Result, maxSeqLength, embeddingSize);

  for pos := 0 to maxSeqLength - 1 do
    for dim := 0 to embeddingSize - 1 do begin
      // Each sin/cos pair shares the exponent of its even column.
      evenDim := dim - (dim and 1);
      phase := pos / Power(10000, evenDim / embeddingSize);

      if Odd(dim) then
        Result[pos][dim] := Cos(phase)
      else
        Result[pos][dim] := Sin(phase);
    end;
end;

// Initialises one transformer layer from Config:
//   * stores Config in the layer;
//   * initialises self-attention with head size InputSize div NumHeads;
//   * creates FFN_weights1 (InputSize x FFNDim) and FFN_weights2
//     (FFNDim x InputSize) with random values in [-0.1, 0.1];
//   * sets the LayerNorm gammas to 1.0 and betas to 0.0;
//   * empties the forward-pass caches;
//   * allocates zeroed gradient buffers for the LayerNorm parameters and for
//     both FFN weight matrices.
procedure InitTransformerLayer(var Layer: TTransformerLayer; const Config: TTransformerConfig);
var
  modelDim, hiddenDim: Integer;

  // Resizes V to modelDim entries and sets every entry to `value`.
  procedure InitVector(var V: TDoubleArray; value: Double);
  begin
    SetLength(V, modelDim);
    FillArray(V, value);
  end;

begin
  Layer.Config := Config;
  modelDim := Config.InputSize;
  hiddenDim := Config.FFNDim;

  // Self-attention first, then the FFN matrices: the order of the random
  // initialisations is kept as is.
  InitMultiHeadAttention(Layer.SelfAttention, modelDim, modelDim div Config.NumHeads, Config.NumHeads);

  Layer.FFN_weights1 := CreateRandomMatrix(modelDim, hiddenDim, -0.1, 0.1);
  Layer.FFN_weights2 := CreateRandomMatrix(hiddenDim, modelDim, -0.1, 0.1);

  // LayerNorm parameters: identity scale, zero shift
  InitVector(Layer.Norm1_Gamma, 1.0);
  InitVector(Layer.Norm1_Beta, 0.0);
  InitVector(Layer.Norm2_Gamma, 1.0);
  InitVector(Layer.Norm2_Beta, 0.0);

  // Caches start out empty
  SetLength(Layer.cachedInput, 0, 0);
  SetLength(Layer.cachedAttentionOutput, 0, 0);
  SetLength(Layer.cachedNorm1Output, 0, 0);
  SetLength(Layer.cachedFFNOutput, 0, 0);
  SetLength(Layer.cachedOutput, 0, 0);

  // Gradient buffers start at zero
  InitVector(Layer.Norm1_Gamma_Grad, 0.0);
  InitVector(Layer.Norm1_Beta_Grad, 0.0);
  InitVector(Layer.Norm2_Gamma_Grad, 0.0);
  InitVector(Layer.Norm2_Beta_Grad, 0.0);

  Layer.FFN_weights1_Grad := CreateZeroMatrix(modelDim, hiddenDim);
  Layer.FFN_weights2_Grad := CreateZeroMatrix(hiddenDim, modelDim);
end;

// Backward pass through one transformer layer.
//
// Steps, in order (each one falls back to passing the gradient through
// unchanged when the cache it needs is empty):
//   1. LayerNorm 2 backward; its gamma/beta gradients are added to
//      Layer.Norm2_Gamma_Grad / Layer.Norm2_Beta_Grad;
//   2. FFN backward via FFNBackward, with cachedNorm1Output as the FFN input;
//   3. LayerNorm 1 backward; its gamma/beta gradients are added to
//      Layer.Norm1_Gamma_Grad / Layer.Norm1_Beta_Grad;
//   4. gradInput is set to a copy of the LayerNorm 1 input gradient;
//   5. AttentionBackward is called with a second copy of that gradient.
//
// Parameters:
//   Layer      - layer whose caches are read and whose gradients are updated
//   gradOutput - gradient with respect to the layer output
//   gradInput  - (out) gradient handed to the previous layer
//
// Any exception raised inside is logged and then terminates the program with
// Halt.
//
// The shape log line no longer indexes gradOutput[0] when gradOutput has no
// rows (a range-check failure before any of the handling below could run).
//
// NOTE(review): in step 2 only the FFN-branch gradient is propagated; the
// skip-path gradient around the FFN (gradNorm2 itself) is not added to it.
// Likewise AttentionBackward returns nothing that is added to gradInput.
// Confirm against the forward pass whether these sums are intended.
procedure BackwardLayer(var Layer: TTransformerLayer;  const gradOutput: TDoubleMatrix; out gradInput: TDoubleMatrix);
var
  gradNorm2, gradNorm1, gradResidual1, gradAttention: TDoubleMatrix;
  gradNorm2_Gamma, gradNorm2_Beta, gradNorm1_Gamma, gradNorm1_Beta: TDoubleArray;
begin
  WriteLn('    BackwardLayer: начат');
  if Length(gradOutput) > 0 then
    WriteLn('      gradOutput: ', Length(gradOutput), 'x', Length(gradOutput[0]))
  else
    WriteLn('      gradOutput: 0x0');

  try
    // 1. Backward through LayerNorm 2
    WriteLn('      LayerNorm 2 backward...');
    if (Length(Layer.cachedOutput) > 0) and (Length(Layer.cachedNorm1Output) > 0) then begin
      gradNorm2 := LayerNormBackward(gradOutput, Layer.cachedOutput, Layer.cachedNorm1Output, Layer.Norm2_Gamma, gradNorm2_Gamma, gradNorm2_Beta);

      // Accumulate the parameter gradients
      Layer.Norm2_Gamma_Grad := AddVectors(Layer.Norm2_Gamma_Grad, gradNorm2_Gamma);
      Layer.Norm2_Beta_Grad := AddVectors(Layer.Norm2_Beta_Grad, gradNorm2_Beta);
    end else begin
      WriteLn('      Предупреждение: кэши пустые, используем упрощенный backward');
      gradNorm2 := CopyMatrix(gradOutput);
    end;

    // 2. Residual connection 2: gradNorm2 feeds the FFN branch directly.
    //    (The previous separate copies of gradNorm2 were never modified and
    //    one of them was never read, so they have been dropped.)
    WriteLn('      Residual 2 backward...');

    // 3. Backward through the FFN
    WriteLn('      FFN backward...');
    if (Length(Layer.cachedNorm1Output) > 0) then begin
      gradNorm1 := FFNBackward(Layer, gradNorm2, Layer.cachedNorm1Output);
    end else begin
      WriteLn('      Предупреждение: cachedNorm1Output пустой');
      gradNorm1 := CopyMatrix(gradNorm2);
    end;

    // 4. Backward through LayerNorm 1
    WriteLn('      LayerNorm 1 backward...');
    if (Length(Layer.cachedNorm1Output) > 0) and (Length(Layer.cachedInput) > 0) then begin
      gradResidual1 := LayerNormBackward(gradNorm1, Layer.cachedNorm1Output, Layer.cachedInput, Layer.Norm1_Gamma, gradNorm1_Gamma, gradNorm1_Beta);

      // Accumulate the parameter gradients
      Layer.Norm1_Gamma_Grad := AddVectors(Layer.Norm1_Gamma_Grad, gradNorm1_Gamma);
      Layer.Norm1_Beta_Grad := AddVectors(Layer.Norm1_Beta_Grad, gradNorm1_Beta);
    end else begin
      WriteLn('      Предупреждение: кэши пустые, используем упрощенный backward');
      gradResidual1 := CopyMatrix(gradNorm1);
    end;

    // 5. Residual connection 1
    WriteLn('      Residual 1 backward...');
    gradInput := CopyMatrix(gradResidual1);     // gradient for the previous layer
    gradAttention := CopyMatrix(gradResidual1); // gradient for the attention branch

    // 6. Backward through self-attention
    WriteLn('      Attention backward...');
    if (Length(Layer.cachedInput) > 0) then begin
      AttentionBackward(Layer.SelfAttention, gradAttention, Layer.cachedInput);
    end else begin
      WriteLn('      Предупреждение: cachedInput пустой');
    end;

    WriteLn('    BackwardLayer: завершен успешно');

  except
    on E: Exception do begin
      WriteLn('ОШИБКА в BackwardLayer: ', E.Message);
      Halt;
      // Not reached while the Halt above is in place.
      gradInput := CopyMatrix(gradOutput);
    end;
  end;
end;

// Allocates numHeads attention heads and initialises each one with
// InitializeAttentionHead(head, inputSize, headSize), logging the parameters
// and the resulting Wo shape of every head. Finally records HeadSize and
// NumHeads on the attention record.
//
// The Wo column count is read through a guarded helper. The previous form,
// IfThen(Length(Wo) > 0, IntToStr(Length(Wo[0])), '0'), evaluated Wo[0] before
// IfThen was called, so a head left with an empty Wo failed the range check
// on the log line.
procedure InitMultiHeadAttention(var Attention: TMultiHeadAttention; inputSize, headSize, numHeads: Integer);
var
  i: Integer;
  totalHeadSize: Integer;

  // Length of the first row, or 0 when the matrix has no rows.
  function ColCount(const M: TDoubleMatrix): Integer;
  begin
    if Length(M) > 0 then
      Result := Length(M[0])
    else
      Result := 0;
  end;

begin
  WriteLn('InitializeMultiHeadAttention:');
  WriteLn('  inputSize: ', inputSize);
  WriteLn('  headSize: ', headSize);
  WriteLn('  numHeads: ', numHeads);

  // Combined width of all heads (logged only)
  totalHeadSize := headSize * numHeads;
  WriteLn('  totalHeadSize: ', totalHeadSize);

  SetLength(Attention.Heads, numHeads);

  for i := 0 to numHeads - 1 do
  begin
    WriteLn('  Инициализация головы ', i, ':');

    // Per-head weights are sized by InitializeAttentionHead (defined
    // elsewhere); presumably Wq/Wk/Wv are inputSize x headSize - confirm.
    InitializeAttentionHead(Attention.Heads[i], inputSize, headSize);

    // Each head keeps its own Wo; the shape is only reported here.
    WriteLn('    Wo размер: ', Length(Attention.Heads[i].Wo), 'x',
            ColCount(Attention.Heads[i].Wo));
  end;

  Attention.HeadSize := headSize;
  Attention.NumHeads := numHeads;
end;

// Forward pass of the whole model.
//
// Pipeline: ApplyEmbedding -> AddPositionalEncodingToSequenceSafe -> every
// layer in order via ForwardLayerOptimized. Output receives the result.
//
// Output is set to an empty matrix when:
//   * Input has no rows or an empty first row;
//   * ValidateModelStructure rejects the model;
//   * the column count of Input differs from Config.InputSize;
//   * the embedding or positional-encoding stage yields an empty matrix;
//   * an exception is raised inside the pipeline (it is logged, not re-raised).
//
// If a layer yields an empty matrix the loop stops and Output is the result
// of the last stage that succeeded; the closing success line is still
// written in that case.
//
// Shapes are logged through a guarded helper. The previous form,
// IfThen(Length(M) > 0, IntToStr(Length(M[0])), '0'), evaluated M[0] before
// IfThen was called, so an empty Input failed the range check on the first
// log line and never reached the empty-input guard.
procedure ForwardTransformer(var Model: TTransformer; const Input: TDoubleMatrix; out Output: TDoubleMatrix; Mask: TDoubleMatrix; isTraining: Boolean);
var
  i: Integer;
  currentOutput, layerOutput: TDoubleMatrix; // layerOutput: separate buffer for each layer's result

  // Returns 'ROWSxCOLS'; COLS is the length of the first row, 0 if no rows.
  function ShapeText(const M: TDoubleMatrix): string;
  begin
    if Length(M) > 0 then
      Result := IntToStr(Length(M)) + 'x' + IntToStr(Length(M[0]))
    else
      Result := '0x0';
  end;

begin
  WriteLn('ForwardTransformerOptimized: начат');
  WriteLn('  Input указатель: ', PtrUInt(@Input));
  WriteLn('  Input: ', ShapeText(Input));

  // Guard: empty input
  if (Length(Input) = 0) or (Length(Input[0]) = 0) then begin
    WriteLn('ОШИБКА: Пустой вход в ForwardTransformerOptimized');
    SetLength(Output, 0, 0);
    Exit;
  end;

  // Guard: the model must be structurally valid before it is used
  if not ValidateModelStructure(Model) then begin
    WriteLn('ОШИБКА: Модель не прошла валидацию');
    SetLength(Output, 0, 0);
    Exit;
  end;

  // Guard: input width must match the configured model width
  if Length(Input[0]) <> Model.Config.InputSize then begin
    WriteLn('ОШИБКА: Несовпадение размерности входных данных: ', Length(Input[0]), ' != ', Model.Config.InputSize);
    SetLength(Output, 0, 0);
    Exit;
  end;

  try
    // Embedding
    WriteLn('  Вызов ApplyEmbedding...');
    currentOutput := ApplyEmbedding(Model, Input);
    WriteLn('  После embedding: ', ShapeText(currentOutput));

    if (Length(currentOutput) = 0) or (Length(currentOutput[0]) = 0) then begin
      WriteLn('ОШИБКА: Пустой вывод после embedding');
      SetLength(Output, 0, 0);
      Exit;
    end;

    // Positional encoding
    WriteLn('  Добавление позиционного кодирования...');
    currentOutput := AddPositionalEncodingToSequenceSafe(currentOutput, Model.Config.MaxSeqLength);
    WriteLn('  После positional encoding: ', ShapeText(currentOutput));

    if (Length(currentOutput) = 0) or (Length(currentOutput[0]) = 0) then begin
      WriteLn('ОШИБКА: Пустой вывод после positional encoding');
      SetLength(Output, 0, 0);
      Exit;
    end;

    WriteLn('  currentOutput указатель перед слоями: ', PtrUInt(@currentOutput));

    // Layers
    for i := 0 to High(Model.Layers) do begin
      WriteLn('  Слой ', i, ':');
      WriteLn('    currentOutput указатель: ', PtrUInt(@currentOutput));
      WriteLn('    currentOutput: ', ShapeText(currentOutput));

      // Re-check the running value before handing it to the layer
      if (Length(currentOutput) = 0) then begin
        WriteLn('    КРИТИЧЕСКАЯ ОШИБКА: currentOutput имеет длину 0 перед вызовом');
        Break;
      end;

      if (Length(currentOutput[0]) = 0) then begin
        WriteLn('    КРИТИЧЕСКАЯ ОШИБКА: currentOutput[0] имеет длину 0 перед вызовом');
        Break;
      end;

      WriteLn('    Вызов ForwardLayerOptimized...');

      // The layer writes into its own buffer, never into its input
      SetLength(layerOutput, 0, 0);
      ForwardLayerOptimized(Model.Layers[i], currentOutput, layerOutput, Mask, isTraining);

      WriteLn('    layerOutput после вызова: ', ShapeText(layerOutput));

      // Adopt the layer result only if it is non-empty
      if (Length(layerOutput) > 0) and (Length(layerOutput[0]) > 0) then begin
        currentOutput := layerOutput;
      end else begin
        WriteLn('    ОШИБКА: Пустой вывод из ForwardLayerOptimized');
        Break;
      end;

      WriteLn('    После слоя: ', ShapeText(currentOutput));
    end;

    Output := currentOutput;
    WriteLn('ForwardTransformerOptimized: завершен успешно');
    WriteLn('  Output: ', ShapeText(Output));

  except
    on E: Exception do begin
      WriteLn('  КРИТИЧЕСКАЯ ОШИБКА в ForwardTransformerOptimized: ', E.Message);
      WriteLn('    Тип ошибки: ', E.ClassName);
      SetLength(Output, 0, 0);
    end;
  end;
end;

// Checks that the model's tensors are allocated and reports every problem
// found to stdout. Returns True only if all checks pass:
//   * the embedding matrix has rows and a non-empty first row;
//   * the number of layers equals Config.NumLayers;
//   * FFN_weights1 and FFN_weights2 of every layer have rows and a non-empty
//     first row;
//   * Wq, Wk, Wv and Wo of every attention head have at least one row.
// All checks are always run; the function does not stop at the first failure.
function ValidateModelStructure(const Model: TTransformer): Boolean;
var
  layerIdx, headIdx: Integer;
  allGood: Boolean;

  // True when M has no rows or its first row is empty.
  function HasNoCells(const M: TDoubleMatrix): Boolean;
  begin
    Result := (Length(M) = 0) or (Length(M[0]) = 0);
  end;

begin
  WriteLn('=== ПРОВЕРКА СТРУКТУРЫ МОДЕЛИ ===');
  allGood := True;

  // Embedding
  if HasNoCells(Model.Embedding) then
  begin
    WriteLn('ОШИБКА: Embedding матрица пустая');
    allGood := False;
  end;

  // Layer count
  if Length(Model.Layers) <> Model.Config.NumLayers then
  begin
    WriteLn('ОШИБКА: Количество слоев не совпадает с конфигурацией');
    allGood := False;
  end;

  for layerIdx := 0 to High(Model.Layers) do
  begin
    // FFN weights
    if HasNoCells(Model.Layers[layerIdx].FFN_weights1) then
    begin
      WriteLn('ОШИБКА: FFN_weights1 слоя ', layerIdx, ' пустые');
      allGood := False;
    end;

    if HasNoCells(Model.Layers[layerIdx].FFN_weights2) then
    begin
      WriteLn('ОШИБКА: FFN_weights2 слоя ', layerIdx, ' пустые');
      allGood := False;
    end;

    // Attention heads: every projection matrix needs at least one row
    for headIdx := 0 to High(Model.Layers[layerIdx].SelfAttention.Heads) do
      if not ((Length(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].Wq) > 0) and
              (Length(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].Wk) > 0) and
              (Length(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].Wv) > 0) and
              (Length(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].Wo) > 0)) then
      begin
        WriteLn('ОШИБКА: Веса головы ', headIdx, ' слоя ', layerIdx, ' пустые');
        allGood := False;
      end;
  end;

  if allGood then
    WriteLn('Структура модели валидна')
  else
    WriteLn('Обнаружены проблемы в структуре модели');

  Result := allGood;
end;

// Allocates and zeroes every gradient buffer that is still empty, sizing each
// one from its parameter tensor: Embedding_Grad from Embedding and, per layer,
// FFN1_Grad / FFN2_Grad from FFN_weights1 / FFN_weights2, the head gradients
// dWq/dWk/dWv/dWo from Wq/Wk/Wv/Wo, and the four LayerNorm gradient vectors
// from their gamma/beta vectors. Buffers that already have a non-zero length
// are left untouched.
//
// A matrix gradient whose weight matrix is itself empty is left empty.
// Previously that case indexed row 0 of the empty weight matrix to obtain
// the column count, which fails the range check.
procedure InitializeAllGradients(var Model: TTransformer);
var
  i, j: Integer;

  // Allocates Grad with the shape of Weights (columns from its first row) and
  // zeroes it, but only if Grad is empty and Weights is not.
  procedure EnsureMatrixGrad(var Grad: TDoubleMatrix; const Weights: TDoubleMatrix);
  begin
    if Length(Grad) <> 0 then
      Exit;
    if Length(Weights) = 0 then
      Exit; // nothing to size the gradient from
    SetLength(Grad, Length(Weights), Length(Weights[0]));
    FillMatrix(Grad, 0.0);
  end;

  // Allocates Grad with the length of Params and zeroes it, if Grad is empty.
  procedure EnsureVectorGrad(var Grad: TDoubleArray; const Params: TDoubleArray);
  begin
    if Length(Grad) <> 0 then
      Exit;
    SetLength(Grad, Length(Params));
    FillArray(Grad, 0.0);
  end;

begin
  WriteLn('InitializeAllGradients: принудительная инициализация всех градиентов');

  // Embedding gradient
  EnsureMatrixGrad(Model.Embedding_Grad, Model.Embedding);

  for i := 0 to High(Model.Layers) do
  begin
    // FFN gradients
    EnsureMatrixGrad(Model.Layers[i].FFN1_Grad, Model.Layers[i].FFN_weights1);
    EnsureMatrixGrad(Model.Layers[i].FFN2_Grad, Model.Layers[i].FFN_weights2);

    // Attention gradients
    for j := 0 to High(Model.Layers[i].SelfAttention.Heads) do
    begin
      EnsureMatrixGrad(Model.Layers[i].SelfAttention.Heads[j].dWq, Model.Layers[i].SelfAttention.Heads[j].Wq);
      EnsureMatrixGrad(Model.Layers[i].SelfAttention.Heads[j].dWk, Model.Layers[i].SelfAttention.Heads[j].Wk);
      EnsureMatrixGrad(Model.Layers[i].SelfAttention.Heads[j].dWv, Model.Layers[i].SelfAttention.Heads[j].Wv);
      EnsureMatrixGrad(Model.Layers[i].SelfAttention.Heads[j].dWo, Model.Layers[i].SelfAttention.Heads[j].Wo);
    end;

    // LayerNorm gradients
    EnsureVectorGrad(Model.Layers[i].Norm1_Gamma_Grad, Model.Layers[i].Norm1_Gamma);
    EnsureVectorGrad(Model.Layers[i].Norm1_Beta_Grad, Model.Layers[i].Norm1_Beta);
    EnsureVectorGrad(Model.Layers[i].Norm2_Gamma_Grad, Model.Layers[i].Norm2_Gamma);
    EnsureVectorGrad(Model.Layers[i].Norm2_Beta_Grad, Model.Layers[i].Norm2_Beta);
  end;

  WriteLn('InitializeAllGradients: завершено');
end;

// Applies the attention output projection to the concatenated head outputs.
//
// Parameters:
//   concatOutput - matrix whose column count is compared against the Wo row
//                  counts of the heads to choose a projection strategy.
//   Heads        - attention heads; only each head's Wo matrix is read here.
//
// Returns one of:
//   * concatOutput * Wo_combined, where Wo_combined stacks (row-wise) the Wo
//     of every head that has a non-empty Wo, when the column count of
//     concatOutput equals the total number of stacked rows;
//   * concatOutput * Heads[0].Wo when the column count equals the row count
//     of Heads[0].Wo;
//   * a copy of concatOutput when there are no heads or no shape matches;
//   * an empty matrix (nil) when concatOutput has no rows.
//
// Progress and diagnostics are written to standard output via WriteLn.
function ApplyOutputProjection(const concatOutput: TDoubleMatrix; const Heads: array of TAttentionHead): TDoubleMatrix;
var
  i, j, k, offset: Integer;
  totalHeadSize, inputSize, concatCols: Integer;
  Wo_combined: TDoubleMatrix;
begin
  WriteLn('    ApplyOutputProjection: начат');

  // Guard: concatOutput[0] is read several times below; with no rows that
  // access would be out of bounds, so return an empty result instead.
  if Length(concatOutput) = 0 then
  begin
    WriteLn('      ⚠ Пустая входная матрица');
    Result := nil;
    Exit;
  end;

  concatCols := Length(concatOutput[0]);
  WriteLn('      concatOutput: ', Length(concatOutput), 'x', concatCols);

  if Length(Heads) = 0 then
  begin
    WriteLn('      ⚠ Нет голов внимания');
    Result := CopyMatrix(concatOutput);
    Exit;
  end;

  // Compute the expected sizes. Heads with an empty Wo are skipped.
  // inputSize is taken from the first head that actually has a usable Wo,
  // rather than unconditionally from Heads[0], whose Wo may be empty.
  totalHeadSize := 0;
  inputSize := 0;
  for i := 0 to High(Heads) do
  begin
    if (Length(Heads[i].Wo) > 0) and (Length(Heads[i].Wo[0]) > 0) then
    begin
      if totalHeadSize = 0 then
        inputSize := Length(Heads[i].Wo[0]);
      totalHeadSize := totalHeadSize + Length(Heads[i].Wo);
    end;
  end;

  WriteLn('      totalHeadSize: ', totalHeadSize, ', inputSize: ', inputSize);
  WriteLn('      concatOutput cols: ', concatCols);

  // Check shape compatibility. totalHeadSize > 0 keeps a zero-column input
  // from being "projected" through an empty combined matrix.
  if (totalHeadSize > 0) and (concatCols = totalHeadSize) then
  begin
    // Build the combined Wo matrix of size (totalHeadSize x inputSize).
    // NOTE(review): assumes every usable head's Wo is rectangular and has the
    // same column count as the first usable head - confirm against the code
    // that initializes the heads.
    SetLength(Wo_combined, totalHeadSize, inputSize);

    // Stack each head's Wo rows one after another.
    offset := 0;
    for i := 0 to High(Heads) do
    begin
      if (Length(Heads[i].Wo) > 0) and (Length(Heads[i].Wo[0]) > 0) then
      begin
        for j := 0 to High(Heads[i].Wo) do
        begin
          for k := 0 to High(Heads[i].Wo[0]) do
          begin
            Wo_combined[offset + j][k] := Heads[i].Wo[j][k];
          end;
        end;
        Inc(offset, Length(Heads[i].Wo));
      end;
    end;

    Result := MatrixMultiply(concatOutput, Wo_combined);
    WriteLn('      ✓ Применена объединенная выходная проекция');
    WriteLn('      Result: ', Length(Result), 'x', Length(Result[0]));
  end
  else if (Length(Heads[0].Wo) > 0) and (concatCols = Length(Heads[0].Wo)) then
  begin
    // Simplified fallback: project with the first head's Wo only.
    Result := MatrixMultiply(concatOutput, Heads[0].Wo);
    WriteLn('      ✓ Применена выходная проекция Wo[0]');
    WriteLn('      Result: ', Length(Result), 'x', Length(Result[0]));
  end
  else
  begin
    // No compatible shape: skip the projection and pass the input through.
    WriteLn('      ⚠ Несовпадение размеров, пропускаем проекцию');
    WriteLn('        concatOutput cols: ', concatCols);
    WriteLn('        totalHeadSize: ', totalHeadSize);
    WriteLn('        Wo[0] rows: ', Length(Heads[0].Wo));
    Result := CopyMatrix(concatOutput);
  end;

  WriteLn('    ApplyOutputProjection: завершен');
end;

end.