unit GradientAccumulator;
{$MODE OBJFPC}{$H+}{$RANGECHECKS ON}

{
    Part of AdvancedChatAI.
    For GNU/Linux, 64-bit version.
    Version: 1.
    Written in Free Pascal (https://freepascal.org/).
    Copyright (C) 2025-2026 Artyomov Alexander
    Used https://chat.deepseek.com/
    http://self-made-free.ru/
    aralni@mail.ru

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
}


interface

uses
  SysUtils, Transformer, MatrixOps, DataUtils;

type
  { Buffers that hold gradients summed over several calls to
    AccumulateGradients. InitAccumulatedGradients gives each buffer the
    dimensions of the model parameter named in the comment next to it and
    fills it with zeros. }
  TAccumulatedGradients = record
    EmbeddingGrad: TDoubleMatrix;        // sized like Model.Embedding
    LayerGradients: array of record      // one entry per element of Model.Layers
      FFN1_Grad: TDoubleMatrix;          // sized like FFN_weights1
      FFN2_Grad: TDoubleMatrix;          // sized like FFN_weights2
      Norm1_Gamma_Grad: TDoubleArray;    // sized like Norm1_Gamma
      Norm1_Beta_Grad: TDoubleArray;     // sized like Norm1_Beta
      Norm2_Gamma_Grad: TDoubleArray;    // sized like Norm2_Gamma
      Norm2_Beta_Grad: TDoubleArray;     // sized like Norm2_Beta
      AttentionGradients: array of record // one entry per element of SelfAttention.Heads
        dWq: TDoubleMatrix;              // sized like the head's Wq
        dWk: TDoubleMatrix;              // sized like the head's Wk
        dWv: TDoubleMatrix;              // sized like the head's Wv
        dWo: TDoubleMatrix;              // sized like the head's Wo
      end;
    end;
  end;

{ Sizes every buffer of Accumulated after the matching parameter of Model
  and fills it with zeros. Buffers whose model parameter is empty are left
  as they are. Prints a progress message to standard output. }
procedure InitAccumulatedGradients(var Model: TTransformer; 
                                 var Accumulated: TAccumulatedGradients);
{ Adds the gradients currently stored in Model to the running sums in
  Accumulated. CurrentGradOutput is not read by the implementation. }
procedure AccumulateGradients(var Model: TTransformer;
                            const CurrentGradOutput: TDoubleMatrix;
                            var Accumulated: TAccumulatedGradients);
{ Gradient-descent step: subtracts EffectiveLearningRate times the
  accumulated gradient from the corresponding parameters of Model.
  Accumulated itself is not modified. Prints the learning rate to standard
  output. }
procedure ApplyAccumulatedGradients(var Model: TTransformer;
                                  const Accumulated: TAccumulatedGradients;
                                  EffectiveLearningRate: Double);
{ Sets the length of every buffer in Accumulated, and of the LayerGradients
  array itself, to zero. }
procedure FreeAccumulatedGradients(var Accumulated: TAccumulatedGradients);

implementation

{ Prepares Accumulated for a new accumulation cycle.

  Every gradient buffer receives the dimensions of the model parameter it
  belongs to and is filled with zeros. A buffer whose model parameter has
  length zero is not touched. LayerGradients gets one entry per model layer
  and AttentionGradients one entry per attention head of that layer.

  Model is only read. A progress line is written to standard output. }
procedure InitAccumulatedGradients(var Model: TTransformer; 
                                 var Accumulated: TAccumulatedGradients);

  // Gives Target the shape RowCount x ColCount and zero-fills it.
  procedure ResetMatrix(var Target: TDoubleMatrix; RowCount, ColCount: Integer);
  begin
    SetLength(Target, RowCount, ColCount);
    FillMatrix(Target, 0.0);
  end;

  // Gives Target the length Count and zero-fills it; Count = 0 is a no-op.
  procedure ResetArray(var Target: TDoubleArray; Count: Integer);
  begin
    if Count > 0 then
    begin
      SetLength(Target, Count);
      FillArray(Target, 0.0);
    end;
  end;

var
  layerIdx, headIdx: Integer;
begin
  WriteLn('Инициализация накопленных градиентов...');

  // Embedding gradients. The column count is read from row 0, so the
  // emptiness check has to come before the call.
  if Length(Model.Embedding) > 0 then
    ResetMatrix(Accumulated.EmbeddingGrad,
                Length(Model.Embedding),
                Length(Model.Embedding[0]));

  // One gradient record per transformer layer.
  SetLength(Accumulated.LayerGradients, Length(Model.Layers));

  for layerIdx := 0 to High(Model.Layers) do
  begin
    // Feed-forward weights
    if Length(Model.Layers[layerIdx].FFN_weights1) > 0 then
      ResetMatrix(Accumulated.LayerGradients[layerIdx].FFN1_Grad,
                  Length(Model.Layers[layerIdx].FFN_weights1),
                  Length(Model.Layers[layerIdx].FFN_weights1[0]));

    if Length(Model.Layers[layerIdx].FFN_weights2) > 0 then
      ResetMatrix(Accumulated.LayerGradients[layerIdx].FFN2_Grad,
                  Length(Model.Layers[layerIdx].FFN_weights2),
                  Length(Model.Layers[layerIdx].FFN_weights2[0]));

    // LayerNorm parameters
    ResetArray(Accumulated.LayerGradients[layerIdx].Norm1_Gamma_Grad,
               Length(Model.Layers[layerIdx].Norm1_Gamma));
    ResetArray(Accumulated.LayerGradients[layerIdx].Norm1_Beta_Grad,
               Length(Model.Layers[layerIdx].Norm1_Beta));
    ResetArray(Accumulated.LayerGradients[layerIdx].Norm2_Gamma_Grad,
               Length(Model.Layers[layerIdx].Norm2_Gamma));
    ResetArray(Accumulated.LayerGradients[layerIdx].Norm2_Beta_Grad,
               Length(Model.Layers[layerIdx].Norm2_Beta));

    // One gradient record per attention head.
    SetLength(Accumulated.LayerGradients[layerIdx].AttentionGradients,
              Length(Model.Layers[layerIdx].SelfAttention.Heads));

    for headIdx := 0 to High(Model.Layers[layerIdx].SelfAttention.Heads) do
    begin
      if Length(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].Wq) > 0 then
        ResetMatrix(Accumulated.LayerGradients[layerIdx].AttentionGradients[headIdx].dWq,
                    Length(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].Wq),
                    Length(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].Wq[0]));

      if Length(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].Wk) > 0 then
        ResetMatrix(Accumulated.LayerGradients[layerIdx].AttentionGradients[headIdx].dWk,
                    Length(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].Wk),
                    Length(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].Wk[0]));

      if Length(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].Wv) > 0 then
        ResetMatrix(Accumulated.LayerGradients[layerIdx].AttentionGradients[headIdx].dWv,
                    Length(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].Wv),
                    Length(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].Wv[0]));

      if Length(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].Wo) > 0 then
        ResetMatrix(Accumulated.LayerGradients[layerIdx].AttentionGradients[headIdx].dWo,
                    Length(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].Wo),
                    Length(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].Wo[0]));
    end;
  end;
end;

{ Adds the gradients currently stored in Model, element by element, to the
  running sums in Accumulated.

  Covered: Embedding_Grad, and for every layer FFN1_Grad, FFN2_Grad,
  Norm1_Gamma_Grad plus dWq, dWk, dWv and dWo of every attention head.
  A pair is skipped when either the model gradient or the accumulation
  buffer is empty. Loop bounds come from the model gradient, so the
  accumulation buffer must be at least as large.

  NOTE(review): CurrentGradOutput is never read in this procedure.
  NOTE(review): the Norm1_Beta_Grad, Norm2_Gamma_Grad and Norm2_Beta_Grad
  buffers of Accumulated are not updated here. }
procedure AccumulateGradients(var Model: TTransformer;
                            const CurrentGradOutput: TDoubleMatrix;
                            var Accumulated: TAccumulatedGradients);
var
  layerIdx, headIdx, row, col, lastCol: Integer;
begin
  // Embedding gradients
  if Length(Model.Embedding_Grad) > 0 then
    if Length(Accumulated.EmbeddingGrad) > 0 then
    begin
      // Every row is assumed to be as wide as row 0.
      lastCol := High(Model.Embedding_Grad[0]);
      for row := 0 to High(Model.Embedding_Grad) do
        for col := 0 to lastCol do
          Accumulated.EmbeddingGrad[row][col] :=
            Accumulated.EmbeddingGrad[row][col] + Model.Embedding_Grad[row][col];
    end;

  for layerIdx := 0 to High(Model.Layers) do
  begin
    // Feed-forward gradients, first matrix
    if Length(Model.Layers[layerIdx].FFN1_Grad) > 0 then
      if Length(Accumulated.LayerGradients[layerIdx].FFN1_Grad) > 0 then
      begin
        lastCol := High(Model.Layers[layerIdx].FFN1_Grad[0]);
        for row := 0 to High(Model.Layers[layerIdx].FFN1_Grad) do
          for col := 0 to lastCol do
            Accumulated.LayerGradients[layerIdx].FFN1_Grad[row][col] :=
              Accumulated.LayerGradients[layerIdx].FFN1_Grad[row][col] +
              Model.Layers[layerIdx].FFN1_Grad[row][col];
      end;

    // Feed-forward gradients, second matrix
    if Length(Model.Layers[layerIdx].FFN2_Grad) > 0 then
      if Length(Accumulated.LayerGradients[layerIdx].FFN2_Grad) > 0 then
      begin
        lastCol := High(Model.Layers[layerIdx].FFN2_Grad[0]);
        for row := 0 to High(Model.Layers[layerIdx].FFN2_Grad) do
          for col := 0 to lastCol do
            Accumulated.LayerGradients[layerIdx].FFN2_Grad[row][col] :=
              Accumulated.LayerGradients[layerIdx].FFN2_Grad[row][col] +
              Model.Layers[layerIdx].FFN2_Grad[row][col];
      end;

    // LayerNorm gradient (gamma of the first norm only)
    if Length(Model.Layers[layerIdx].Norm1_Gamma_Grad) > 0 then
      if Length(Accumulated.LayerGradients[layerIdx].Norm1_Gamma_Grad) > 0 then
        for row := 0 to High(Model.Layers[layerIdx].Norm1_Gamma_Grad) do
          Accumulated.LayerGradients[layerIdx].Norm1_Gamma_Grad[row] :=
            Accumulated.LayerGradients[layerIdx].Norm1_Gamma_Grad[row] +
            Model.Layers[layerIdx].Norm1_Gamma_Grad[row];

    // Attention gradients, one set of four matrices per head
    for headIdx := 0 to High(Model.Layers[layerIdx].SelfAttention.Heads) do
    begin
      // Query projection
      if Length(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].dWq) > 0 then
        if Length(Accumulated.LayerGradients[layerIdx].AttentionGradients[headIdx].dWq) > 0 then
        begin
          lastCol := High(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].dWq[0]);
          for row := 0 to High(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].dWq) do
            for col := 0 to lastCol do
              Accumulated.LayerGradients[layerIdx].AttentionGradients[headIdx].dWq[row][col] :=
                Accumulated.LayerGradients[layerIdx].AttentionGradients[headIdx].dWq[row][col] +
                Model.Layers[layerIdx].SelfAttention.Heads[headIdx].dWq[row][col];
        end;

      // Key projection
      if Length(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].dWk) > 0 then
        if Length(Accumulated.LayerGradients[layerIdx].AttentionGradients[headIdx].dWk) > 0 then
        begin
          lastCol := High(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].dWk[0]);
          for row := 0 to High(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].dWk) do
            for col := 0 to lastCol do
              Accumulated.LayerGradients[layerIdx].AttentionGradients[headIdx].dWk[row][col] :=
                Accumulated.LayerGradients[layerIdx].AttentionGradients[headIdx].dWk[row][col] +
                Model.Layers[layerIdx].SelfAttention.Heads[headIdx].dWk[row][col];
        end;

      // Value projection
      if Length(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].dWv) > 0 then
        if Length(Accumulated.LayerGradients[layerIdx].AttentionGradients[headIdx].dWv) > 0 then
        begin
          lastCol := High(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].dWv[0]);
          for row := 0 to High(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].dWv) do
            for col := 0 to lastCol do
              Accumulated.LayerGradients[layerIdx].AttentionGradients[headIdx].dWv[row][col] :=
                Accumulated.LayerGradients[layerIdx].AttentionGradients[headIdx].dWv[row][col] +
                Model.Layers[layerIdx].SelfAttention.Heads[headIdx].dWv[row][col];
        end;

      // Output projection
      if Length(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].dWo) > 0 then
        if Length(Accumulated.LayerGradients[layerIdx].AttentionGradients[headIdx].dWo) > 0 then
        begin
          lastCol := High(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].dWo[0]);
          for row := 0 to High(Model.Layers[layerIdx].SelfAttention.Heads[headIdx].dWo) do
            for col := 0 to lastCol do
              Accumulated.LayerGradients[layerIdx].AttentionGradients[headIdx].dWo[row][col] :=
                Accumulated.LayerGradients[layerIdx].AttentionGradients[headIdx].dWo[row][col] +
                Model.Layers[layerIdx].SelfAttention.Heads[headIdx].dWo[row][col];
        end;
    end;
  end;
end;

{ Performs one gradient-descent step with the accumulated gradients:

    parameter := parameter - EffectiveLearningRate * accumulated gradient

  Updated parameters: Embedding, and for every layer FFN_weights1,
  FFN_weights2, Norm1_Gamma, Norm1_Beta, Norm2_Gamma, Norm2_Beta plus
  Wq, Wk, Wv and Wo of every attention head. Previously only Embedding,
  FFN_weights1, Norm1_Gamma and Wq were updated, so the remaining
  accumulated buffers were never used.

  A parameter is skipped when its accumulation buffer is empty. For a
  finite learning rate, a buffer that still holds only zeros leaves its
  parameter unchanged.

  Loop bounds come from the model parameter, so every non-empty buffer
  must have at least the parameter's dimensions (as set up by
  InitAccumulatedGradients); with range checking enabled a smaller buffer
  raises a range-check error.

  Accumulated is not modified. The learning rate is written to standard
  output. }
procedure ApplyAccumulatedGradients(var Model: TTransformer;
                                  const Accumulated: TAccumulatedGradients;
                                  EffectiveLearningRate: Double);
var
  i,j,k,l: Integer;
begin
  WriteLn('Применение накопленных градиентов (LR=', EffectiveLearningRate:0:6, ')');

  // Embedding
  if Length(Accumulated.EmbeddingGrad) > 0 then
  begin
    for k := 0 to High(Model.Embedding) do
      for l := 0 to High(Model.Embedding[0]) do
        Model.Embedding[k][l] := Model.Embedding[k][l] -
                               EffectiveLearningRate * Accumulated.EmbeddingGrad[k][l];
  end;

  for i := 0 to High(Model.Layers) do
  begin
    // Feed-forward weights, first matrix
    if Length(Accumulated.LayerGradients[i].FFN1_Grad) > 0 then
    begin
      for k := 0 to High(Model.Layers[i].FFN_weights1) do
        for l := 0 to High(Model.Layers[i].FFN_weights1[0]) do
          Model.Layers[i].FFN_weights1[k][l] := Model.Layers[i].FFN_weights1[k][l] -
                                              EffectiveLearningRate *
                                              Accumulated.LayerGradients[i].FFN1_Grad[k][l];
    end;

    // Feed-forward weights, second matrix
    if Length(Accumulated.LayerGradients[i].FFN2_Grad) > 0 then
    begin
      for k := 0 to High(Model.Layers[i].FFN_weights2) do
        for l := 0 to High(Model.Layers[i].FFN_weights2[0]) do
          Model.Layers[i].FFN_weights2[k][l] := Model.Layers[i].FFN_weights2[k][l] -
                                              EffectiveLearningRate *
                                              Accumulated.LayerGradients[i].FFN2_Grad[k][l];
    end;

    // LayerNorm parameters of the first norm
    if Length(Accumulated.LayerGradients[i].Norm1_Gamma_Grad) > 0 then
    begin
      for k := 0 to High(Model.Layers[i].Norm1_Gamma) do
        Model.Layers[i].Norm1_Gamma[k] := Model.Layers[i].Norm1_Gamma[k] -
                                        EffectiveLearningRate *
                                        Accumulated.LayerGradients[i].Norm1_Gamma_Grad[k];
    end;

    if Length(Accumulated.LayerGradients[i].Norm1_Beta_Grad) > 0 then
    begin
      for k := 0 to High(Model.Layers[i].Norm1_Beta) do
        Model.Layers[i].Norm1_Beta[k] := Model.Layers[i].Norm1_Beta[k] -
                                       EffectiveLearningRate *
                                       Accumulated.LayerGradients[i].Norm1_Beta_Grad[k];
    end;

    // LayerNorm parameters of the second norm
    if Length(Accumulated.LayerGradients[i].Norm2_Gamma_Grad) > 0 then
    begin
      for k := 0 to High(Model.Layers[i].Norm2_Gamma) do
        Model.Layers[i].Norm2_Gamma[k] := Model.Layers[i].Norm2_Gamma[k] -
                                        EffectiveLearningRate *
                                        Accumulated.LayerGradients[i].Norm2_Gamma_Grad[k];
    end;

    if Length(Accumulated.LayerGradients[i].Norm2_Beta_Grad) > 0 then
    begin
      for k := 0 to High(Model.Layers[i].Norm2_Beta) do
        Model.Layers[i].Norm2_Beta[k] := Model.Layers[i].Norm2_Beta[k] -
                                       EffectiveLearningRate *
                                       Accumulated.LayerGradients[i].Norm2_Beta_Grad[k];
    end;

    // Attention projections, four matrices per head
    for j := 0 to High(Model.Layers[i].SelfAttention.Heads) do
    begin
      // Query projection
      if Length(Accumulated.LayerGradients[i].AttentionGradients[j].dWq) > 0 then
      begin
        for k := 0 to High(Model.Layers[i].SelfAttention.Heads[j].Wq) do
          for l := 0 to High(Model.Layers[i].SelfAttention.Heads[j].Wq[0]) do
            Model.Layers[i].SelfAttention.Heads[j].Wq[k][l] :=
              Model.Layers[i].SelfAttention.Heads[j].Wq[k][l] -
              EffectiveLearningRate *
              Accumulated.LayerGradients[i].AttentionGradients[j].dWq[k][l];
      end;

      // Key projection
      if Length(Accumulated.LayerGradients[i].AttentionGradients[j].dWk) > 0 then
      begin
        for k := 0 to High(Model.Layers[i].SelfAttention.Heads[j].Wk) do
          for l := 0 to High(Model.Layers[i].SelfAttention.Heads[j].Wk[0]) do
            Model.Layers[i].SelfAttention.Heads[j].Wk[k][l] :=
              Model.Layers[i].SelfAttention.Heads[j].Wk[k][l] -
              EffectiveLearningRate *
              Accumulated.LayerGradients[i].AttentionGradients[j].dWk[k][l];
      end;

      // Value projection
      if Length(Accumulated.LayerGradients[i].AttentionGradients[j].dWv) > 0 then
      begin
        for k := 0 to High(Model.Layers[i].SelfAttention.Heads[j].Wv) do
          for l := 0 to High(Model.Layers[i].SelfAttention.Heads[j].Wv[0]) do
            Model.Layers[i].SelfAttention.Heads[j].Wv[k][l] :=
              Model.Layers[i].SelfAttention.Heads[j].Wv[k][l] -
              EffectiveLearningRate *
              Accumulated.LayerGradients[i].AttentionGradients[j].dWv[k][l];
      end;

      // Output projection
      if Length(Accumulated.LayerGradients[i].AttentionGradients[j].dWo) > 0 then
      begin
        for k := 0 to High(Model.Layers[i].SelfAttention.Heads[j].Wo) do
          for l := 0 to High(Model.Layers[i].SelfAttention.Heads[j].Wo[0]) do
            Model.Layers[i].SelfAttention.Heads[j].Wo[k][l] :=
              Model.Layers[i].SelfAttention.Heads[j].Wo[k][l] -
              EffectiveLearningRate *
              Accumulated.LayerGradients[i].AttentionGradients[j].dWo[k][l];
      end;
    end;
  end;
end;

{ Releases all gradient buffers held by Accumulated.

  Every matrix and array is reset to the empty dynamic array, innermost
  first: the per-head matrices, then the AttentionGradients array of each
  layer, and finally the LayerGradients array itself. Afterwards all
  lengths are zero. }
procedure FreeAccumulatedGradients(var Accumulated: TAccumulatedGradients);
var
  layerIdx, headIdx: Integer;
begin
  // Assigning nil to a dynamic array is equivalent to SetLength(..., 0).
  Accumulated.EmbeddingGrad := nil;

  for layerIdx := 0 to High(Accumulated.LayerGradients) do
  begin
    // Per-layer feed-forward and LayerNorm buffers
    Accumulated.LayerGradients[layerIdx].FFN1_Grad := nil;
    Accumulated.LayerGradients[layerIdx].FFN2_Grad := nil;
    Accumulated.LayerGradients[layerIdx].Norm1_Gamma_Grad := nil;
    Accumulated.LayerGradients[layerIdx].Norm1_Beta_Grad := nil;
    Accumulated.LayerGradients[layerIdx].Norm2_Gamma_Grad := nil;
    Accumulated.LayerGradients[layerIdx].Norm2_Beta_Grad := nil;

    // Per-head attention buffers
    for headIdx := 0 to High(Accumulated.LayerGradients[layerIdx].AttentionGradients) do
    begin
      Accumulated.LayerGradients[layerIdx].AttentionGradients[headIdx].dWq := nil;
      Accumulated.LayerGradients[layerIdx].AttentionGradients[headIdx].dWk := nil;
      Accumulated.LayerGradients[layerIdx].AttentionGradients[headIdx].dWv := nil;
      Accumulated.LayerGradients[layerIdx].AttentionGradients[headIdx].dWo := nil;
    end;

    Accumulated.LayerGradients[layerIdx].AttentionGradients := nil;
  end;

  Accumulated.LayerGradients := nil;
end;

end.