unit LSTM;
{$MODE OBJFPC}{$H+}

{
    Part of AdvancedChatAI.
    For GNU/Linux 64 bit version.
    Version: 1.
    Written on FreePascal (https://freepascal.org/).
    Copyright (C) 2025-2026 Artyomov Alexander
    Used https://chat.deepseek.com/
    http://self-made-free.ru/
    aralni@mail.ru

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
}


interface

uses SysUtils,DataUtils,Math, MatrixOps{, NeuralNetwork};

type
  TLSTMCell = record
    // Весовые матрицы
    Wf, Wi, Wc, Wo: TDoubleMatrix; // Веса для входов
    Uf, Ui, Uc, Uo: TDoubleMatrix; // Веса для состояний
    bf, bi, bc, bo: TDoubleArray;  // Смещения
    
    // Состояния и активации
    f, i, o, g: TDoubleArray;     // Gate activations
    c, c_prev: TDoubleArray;      // Cell states
    h, h_prev: TDoubleArray;      // Hidden states
  end;

  TLSTMLayer = record
    Cells: array of TLSTMCell;
    InputSize, HiddenSize: Integer;
  end;

  TLSTMGradients = record
    dWf, dWi, dWc, dWo: TDoubleMatrix;
    dUf, dUi, dUc, dUo: TDoubleMatrix;
    dbf, dbi, dbc, dbo: TDoubleArray;
    dh_next, dc_next: TDoubleArray;
  end;

procedure InitializeLSTMLayer(var layer: TLSTMLayer; inputSize, hiddenSize: Integer);
procedure LSTMCellForward(var cell: TLSTMCell; const x, h_prev, c_prev: TDoubleArray);
procedure LSTMLayerForward(var layer: TLSTMLayer; const inputSequence: TDoubleMatrix);
procedure TrainLSTM(var layer: TLSTMLayer; 
                  const inputs: TDoubleMatrix;
                  const targets: TDoubleArray;
                  learningRate: Double;
                  epochs: Integer);
procedure ComputeGradients(var cell: TLSTMCell;
                          const x: TDoubleArray;
                          var grad, grad_next: TLSTMGradients);
function CalculateLoss(const predictions, targets: TDoubleArray): Double;

implementation

procedure InitializeLSTMLayer(var layer: TLSTMLayer; inputSize, hiddenSize: Integer);
begin
  layer.InputSize := inputSize;
  layer.HiddenSize := hiddenSize;
  SetLength(layer.Cells, 1);
  
  // Инициализация весовых матриц
  with layer.Cells[0] do
  begin
    Wf := RandomMatrix(hiddenSize, inputSize);
    Wi := RandomMatrix(hiddenSize, inputSize);
    Wc := RandomMatrix(hiddenSize, inputSize);
    Wo := RandomMatrix(hiddenSize, inputSize);
    
    Uf := RandomMatrix(hiddenSize, hiddenSize);
    Ui := RandomMatrix(hiddenSize, hiddenSize);
    Uc := RandomMatrix(hiddenSize, hiddenSize);
    Uo := RandomMatrix(hiddenSize, hiddenSize);
    
    bf := RandomArray(hiddenSize);
    bi := RandomArray(hiddenSize);
    bc := RandomArray(hiddenSize);
    bo := RandomArray(hiddenSize);
  end;
end;

procedure LSTMCellForward(var cell: TLSTMCell; const x, h_prev, c_prev: TDoubleArray);
begin
  // Сохраняем предыдущие состояния
  cell.h_prev := h_prev;
  cell.c_prev := c_prev;
  
  // Forget gate
  cell.f := SigmoidVector(AddVectors(
    AddVectors(MatrixVectorMultiply(cell.Wf, x),
               MatrixVectorMultiply(cell.Uf, h_prev)),
    cell.bf));
  
  // Input gate
  cell.i := SigmoidVector(AddVectors(
    AddVectors(MatrixVectorMultiply(cell.Wi, x),
               MatrixVectorMultiply(cell.Ui, h_prev)),
    cell.bi));
  
  // Output gate
  cell.o := SigmoidVector(AddVectors(
    AddVectors(MatrixVectorMultiply(cell.Wo, x),
               MatrixVectorMultiply(cell.Uo, h_prev)),
    cell.bo));
  
  // Candidate cell state
  cell.g := TanhVector(AddVectors(
    AddVectors(MatrixVectorMultiply(cell.Wc, x),
               MatrixVectorMultiply(cell.Uc, h_prev)),
    cell.bc));
  
  // Новое cell state
  cell.c := AddVectors(
    MultiplyVectors(cell.f, c_prev),
    MultiplyVectors(cell.i, cell.g));
  
  // Новый hidden state
  cell.h := MultiplyVectors(cell.o, TanhVector(cell.c));
end;

procedure LSTMLayerForward(var layer: TLSTMLayer; const inputSequence: TDoubleMatrix);
var
  t: Integer;
begin
  if Length(layer.Cells) = 0 then
    raise Exception.Create('LSTM layer not initialized');

  // Обрабатываем последовательность по шагам времени
  for t := 0 to High(inputSequence) do
  begin
    // Для первого шага используем нулевые предыдущие состояния
    if t = 0 then
    begin
      SetLength(layer.Cells[0].h, layer.HiddenSize);
      SetLength(layer.Cells[0].c, layer.HiddenSize);
      FillChar(layer.Cells[0].h[0], layer.HiddenSize * SizeOf(Double), 0);
      FillChar(layer.Cells[0].c[0], layer.HiddenSize * SizeOf(Double), 0);
    end;

    // Прямой проход через ячейку LSTM
    LSTMCellForward(layer.Cells[0], inputSequence[t], 
                   layer.Cells[0].h, layer.Cells[0].c);
  end;
end;

procedure ZeroGradients(var grad: TLSTMGradients; hiddenSize, inputSize: Integer);
begin
  grad.dWf := CreateZeroMatrix(hiddenSize, inputSize);
  grad.dWi := CreateZeroMatrix(hiddenSize, inputSize);
  grad.dWc := CreateZeroMatrix(hiddenSize, inputSize);
  grad.dWo := CreateZeroMatrix(hiddenSize, inputSize);
  
  grad.dUf := CreateZeroMatrix(hiddenSize, hiddenSize);
  grad.dUi := CreateZeroMatrix(hiddenSize, hiddenSize);
  grad.dUc := CreateZeroMatrix(hiddenSize, hiddenSize);
  grad.dUo := CreateZeroMatrix(hiddenSize, hiddenSize);
  
  SetLength(grad.dbf, hiddenSize);
  SetLength(grad.dbi, hiddenSize);
  SetLength(grad.dbc, hiddenSize);
  SetLength(grad.dbo, hiddenSize);
  
  SetLength(grad.dh_next, hiddenSize);
  SetLength(grad.dc_next, hiddenSize);
end;

procedure ClipGradients(var grad: TLSTMGradients; maxNorm: Double);
var
  norm, scale: Double;
  
  procedure ProcessMatrix(const m: TDoubleMatrix);
  var
    i, j: Integer;
  begin
    for i := 0 to High(m) do
      for j := 0 to High(m[i]) do
        norm := norm + Sqr(m[i][j]);
  end;
  
  procedure ProcessVector(const v: TDoubleArray);
  var
    i: Integer;
  begin
    for i := 0 to High(v) do
      norm := norm + Sqr(v[i]);
  end;
  
  procedure ScaleMatrix(var m: TDoubleMatrix; s: Double);
  var
    i, j: Integer;
  begin
    for i := 0 to High(m) do
      for j := 0 to High(m[i]) do
        m[i][j] := m[i][j] * s;
  end;
  
  procedure ScaleVector(var v: TDoubleArray; s: Double);
  var
    i: Integer;
  begin
    for i := 0 to High(v) do
      v[i] := v[i] * s;
  end;

begin
  norm := 0.0;
  
  // Считаем общую норму всех градиентов
  ProcessMatrix(grad.dWf);
  ProcessMatrix(grad.dWi);
  ProcessMatrix(grad.dWc);
  ProcessMatrix(grad.dWo);
  
  ProcessMatrix(grad.dUf);
  ProcessMatrix(grad.dUi);
  ProcessMatrix(grad.dUc);
  ProcessMatrix(grad.dUo);
  
  ProcessVector(grad.dbf);
  ProcessVector(grad.dbi);
  ProcessVector(grad.dbc);
  ProcessVector(grad.dbo);
  
  norm := Sqrt(norm);
  
  // Применяем clipping если норма превышена
  if norm > maxNorm then
  begin
    scale := maxNorm / (norm + 1e-6); // Добавляем небольшое значение для стабильности
    
    // Масштабируем все матрицы
    ScaleMatrix(grad.dWf, scale);
    ScaleMatrix(grad.dWi, scale);
    ScaleMatrix(grad.dWc, scale);
    ScaleMatrix(grad.dWo, scale);
    
    ScaleMatrix(grad.dUf, scale);
    ScaleMatrix(grad.dUi, scale);
    ScaleMatrix(grad.dUc, scale);
    ScaleMatrix(grad.dUo, scale);
    
    // Масштабируем векторы
    ScaleVector(grad.dbf, scale);
    ScaleVector(grad.dbi, scale);
    ScaleVector(grad.dbc, scale);
    ScaleVector(grad.dbo, scale);
  end;
end;

procedure UpdateWeights(var cell: TLSTMCell; const grad: TLSTMGradients; learningRate: Double);

  procedure UpdateMatrix(var m: TDoubleMatrix; const grad: TDoubleMatrix; lr: Double);
  var
    i, j: Integer;
  begin
    for i := 0 to High(m) do
      for j := 0 to High(m[i]) do
        m[i][j] := m[i][j] - lr * grad[i][j];
  end;
  
  procedure UpdateVector(var v: TDoubleArray; const grad: TDoubleArray; lr: Double);
  var
    i: Integer;
  begin
    for i := 0 to High(v) do
      v[i] := v[i] - lr * grad[i];
  end;

begin
  // Обновляем входные веса
  UpdateMatrix(cell.Wf, grad.dWf, learningRate);
  UpdateMatrix(cell.Wi, grad.dWi, learningRate);
  UpdateMatrix(cell.Wc, grad.dWc, learningRate);
  UpdateMatrix(cell.Wo, grad.dWo, learningRate);
  
  // Обновляем рекуррентные веса
  UpdateMatrix(cell.Uf, grad.dUf, learningRate);
  UpdateMatrix(cell.Ui, grad.dUi, learningRate);
  UpdateMatrix(cell.Uc, grad.dUc, learningRate);
  UpdateMatrix(cell.Uo, grad.dUo, learningRate);
  
  // Обновляем смещения
  UpdateVector(cell.bf, grad.dbf, learningRate);
  UpdateVector(cell.bi, grad.dbi, learningRate);
  UpdateVector(cell.bc, grad.dbc, learningRate);
  UpdateVector(cell.bo, grad.dbo, learningRate);
end;

procedure TrainLSTM(var layer: TLSTMLayer; 
                  const inputs: TDoubleMatrix;
                  const targets: TDoubleArray;
                  learningRate: Double;
                  epochs: Integer);
var
  epoch, t: Integer;
  grads: array of TLSTMGradients;
  cell: TLSTMCell;
  loss: Double;
begin
  if Length(layer.Cells) = 0 then
    raise Exception.Create('LSTM layer not initialized');

  cell := layer.Cells[0];
  SetLength(grads, Length(inputs));

  for epoch := 1 to epochs do
  begin
    // Forward pass
    LSTMLayerForward(layer, inputs);
    
    // Backward pass (BPTT)
    ZeroGradients(grads[High(grads)], layer.HiddenSize, layer.InputSize);
    
    for t := High(inputs) downto 0 do
    begin
      // Вычисляем градиенты для каждого шага времени
      ComputeGradients(cell, inputs[t], grads[t], grads[t+1]);
    end;
    
    // Обновляем веса
    for t := 0 to High(inputs) do
    begin
      ClipGradients(grads[t], 5.0); // Gradient clipping
      UpdateWeights(cell, grads[t], learningRate);
    end;
    
    if epoch mod 10 = 0 then
    begin
      loss := CalculateLoss(layer.Cells[0].h, targets);
      WriteLn('Epoch: ', epoch, ' Loss: ', loss:0:4);
    end;
  end;
end;

procedure ComputeGradients(var cell: TLSTMCell;
                         const x: TDoubleArray;
                         var grad, grad_next: TLSTMGradients);
var
  do_, di, df, dg, dc_temp, dtanh_c: TDoubleArray;
  dtanh: Double;
  i: Integer;
begin
  // Производная по hidden state
  dtanh_c := DTanhVector(cell.c);
  
  // Output gate gradients
  do_ := MultiplyVectors(grad_next.dh_next, dtanh_c);
  do_ := MultiplyVectors(do_, DSigmoidVector(cell.o));
  
  // Cell state gradients
  dc_temp := grad_next.dc_next;
  for i := 0 to High(dc_temp) do
    dc_temp[i] := dc_temp[i] + grad_next.dh_next[i] * cell.o[i] * dtanh_c[i];
  
  // Input gate gradients
  di := MultiplyVectors(dc_temp, cell.g);
  di := MultiplyVectors(di, DSigmoidVector(cell.i));
  
  // Forget gate gradients
  df := MultiplyVectors(dc_temp, cell.c_prev);
  df := MultiplyVectors(df, DSigmoidVector(cell.f));
  
  // Candidate gradients
  dg := MultiplyVectors(dc_temp, cell.i);
  dg := MultiplyVectors(dg, DTanhVector(cell.g));
  
  // Обновляем градиенты весов
  grad.dWf := OuterProduct(df, x);
  grad.dWi := OuterProduct(di, x);
  grad.dWc := OuterProduct(dg, x);
  grad.dWo := OuterProduct(do_, x);
  
  grad.dUf := OuterProduct(df, cell.h_prev);
  grad.dUi := OuterProduct(di, cell.h_prev);
  grad.dUc := OuterProduct(dg, cell.h_prev);
  grad.dUo := OuterProduct(do_, cell.h_prev);
  
  grad.dbf := df;
  grad.dbi := di;
  grad.dbc := dg;
  grad.dbo := do_;
  
  // Вычисляем градиенты для предыдущего состояния
  grad.dh_next := AddVectors(
    MatrixVectorMultiply(TransposeMatrix(cell.Uf), df),
    AddVectors(
      MatrixVectorMultiply(TransposeMatrix(cell.Ui), di),
      AddVectors(
        MatrixVectorMultiply(TransposeMatrix(cell.Uc), dg),
        MatrixVectorMultiply(TransposeMatrix(cell.Uo), do_)
      )
    )
  );
  
  grad.dc_next := MultiplyVectors(dc_temp, cell.f);
end;

function CalculateLoss(const predictions, targets: TDoubleArray): Double;
var
  i: Integer;
begin
  Result := 0;
  // MSE (Mean Squared Error)
  for i := 0 to High(predictions) do
    Result := Result + Sqr(predictions[i] - targets[i]);
  Result := Result / Length(predictions);
end;

end.