unit FuzzySearchUnit;
{$MODE OBJFPC}{$H+}{$RANGECHECKS ON}

{
    Part of AdvancedChatAI.
    For GNU/Linux 64 bit version.
    Version: 1.
    Written in Free Pascal (https://freepascal.org/).
    Copyright (C) 2025-2026 Artyomov Alexander
    Used https://chat.deepseek.com/
    http://self-made-free.ru/
    aralni@mail.ru

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
}


interface

uses
  SysUtils, Classes, ucs4unit, ucs4functionsunit, NeuralNetwork, Word2Vec, MatrixOps, Math;

type
  { One entry of a search result list, as filled in by
    TFuzzySearchEngine.Search. }
  TFuzzySearchResult = record
    Text: string;            // the stored text that matched
    Similarity: Double;      // blended score: 0.7 * cosine + 0.3 * Levenshtein similarity
    OriginalIndex: Integer;  // position of Text in the engine's internal text list
  end;
  TFuzzySearchResults = array of TFuzzySearchResult;

  { Fuzzy text search over a list of stored texts.
    Each stored text has an embedding obtained from a TWordEmbeddings
    instance; queries are scored against the stored texts by a blend of
    embedding cosine similarity and Levenshtein similarity. }
  TFuzzySearchEngine = class
  private
    FWordEmbeddings: TWordEmbeddings;  // embedding provider, owned by the engine
    FTexts: TStringList;               // stored texts, owned by the engine
    FEmbeddings: TDoubleMatrix;        // FEmbeddings[i] is the embedding of FTexts[i]
    { Recomputes FEmbeddings for every entry of FTexts. }
    procedure PrecomputeEmbeddings;
  public
    { Creates the engine; Word2VecModelPath is passed to TWordEmbeddings.Create
      (presumably the path of a model file -- confirm against Word2Vec unit). }
    constructor Create(const Word2VecModelPath: string);
    destructor Destroy; override;
    { Stores Text and makes it searchable. }
    procedure AddText(const Text: string);
    { Scores Query against the stored texts and returns at most TopK entries. }
    function Search(const Query: string; TopK: Integer = 5): TFuzzySearchResults;
    // Persistence is not implemented yet; the declarations are kept disabled.
//    procedure SaveToFile(const FileName: string);
//    procedure LoadFromFile(const FileName: string);
  end;

{ Converts a string (treated as UTF-8 by ucs4.FromUTF8) into a ucs4 value. }
function StringToUCS4(const S: string): ucs4;
{ Cosine similarity of two vectors; returns 0.0 when either vector is empty,
  the lengths differ, or either vector has zero norm. }
function CosineSimilarity(const A, B: TDoubleArray): Double;
{ Returns 1 - LevenshteinDistance / Max(length), or 1.0 when both inputs
  have length 0. }
function LevenshteinSimilarity(const S1, S2: ucs4): Double;

implementation

{ Builds an engine with an empty text list and a word-embedding provider
  constructed from Word2VecModelPath. }
constructor TFuzzySearchEngine.Create(const Word2VecModelPath: string);
begin
  // Container for the searchable texts; it owns any objects attached to its items.
  FTexts := TStringList.Create;
  FTexts.OwnsObjects := True;

  // Embedding provider used for both the stored texts and the queries.
  FWordEmbeddings := TWordEmbeddings.Create(Word2VecModelPath);
end;

{ Releases the embedding provider, the text list and the cached embeddings. }
destructor TFuzzySearchEngine.Destroy;
begin
  FreeAndNil(FWordEmbeddings);
  FreeAndNil(FTexts);
  // Assigning nil to a dynamic array releases it, same as SetLength(..., 0).
  FEmbeddings := nil;
  inherited Destroy;
end;

{ Stores Text and caches its embedding so that Search can score it.

  Only the embedding of the new text is computed; the embeddings already
  cached for earlier texts are reused instead of being recomputed on every
  call. The embedding is computed before any state is modified, so if
  GetEmbedding raises, neither the text list nor the cache is changed. }
procedure TFuzzySearchEngine.AddText(const Text: string);
var
  NewEmbedding: TDoubleArray;
  NewIndex: Integer;
begin
  NewEmbedding := FWordEmbeddings.GetEmbedding(Text);
  NewIndex := FTexts.Add(Text);

  if (NewIndex = FTexts.Count - 1) and (NewIndex = Length(FEmbeddings)) then
  begin
    // Normal case: the text was appended and the cache was in step with the
    // list, so the cache only has to grow by one row.
    SetLength(FEmbeddings, NewIndex + 1);
    FEmbeddings[NewIndex] := NewEmbedding;
  end
  else
    // The list and the cache are not aligned (or the text was not appended
    // at the end): rebuild the whole cache to restore the invariant
    // FEmbeddings[i] <-> FTexts[i].
    PrecomputeEmbeddings;
end;

{ Rebuilds the embedding cache from scratch: after the call FEmbeddings has
  one row per stored text and row i holds the embedding of FTexts[i]. }
procedure TFuzzySearchEngine.PrecomputeEmbeddings;
var
  Row: Integer;
begin
  SetLength(FEmbeddings, FTexts.Count);
  for Row := Low(FEmbeddings) to High(FEmbeddings) do
    FEmbeddings[Row] := FWordEmbeddings.GetEmbedding(FTexts[Row]);
end;

{ Returns the stored texts most similar to Query, best match first.

  Each stored text is scored as
    0.7 * CosineSimilarity(query embedding, text embedding)
    + 0.3 * LevenshteinSimilarity(query, text).

  Parameters:
    Query - the text to look for.
    TopK  - maximum number of results; values <= 0 yield an empty result.

  Result: Min(TopK, number of stored texts) entries ordered by descending
  Similarity. Entries with equal scores are ordered by ascending
  OriginalIndex. Every stored text appears at most once. }
function TFuzzySearchEngine.Search(const Query: string; TopK: Integer): TFuzzySearchResults;
const
  EmbeddingWeight = 0.7;
  LevenshteinWeight = 0.3;
var
  QueryEmbedding: TDoubleArray;
  QueryUCS4: ucs4;
  Scores: array of Double;
  Taken: array of Boolean;
  Slot, Candidate, BestIndex, ResultCount: Integer;
begin
  Result := nil;
  ResultCount := Min(TopK, FTexts.Count);
  // Nothing stored, or a non-positive TopK: return an empty list instead of
  // passing a negative length to SetLength.
  if ResultCount <= 0 then
    Exit;

  // The query representations do not depend on the candidate, so they are
  // computed once rather than once per stored text.
  QueryEmbedding := FWordEmbeddings.GetEmbedding(Query);
  QueryUCS4 := StringToUCS4(Query);

  // Score every stored text: embedding similarity blended with the
  // Levenshtein similarity for extra precision.
  SetLength(Scores, FTexts.Count);
  for Candidate := 0 to FTexts.Count - 1 do
    Scores[Candidate] :=
      CosineSimilarity(QueryEmbedding, FEmbeddings[Candidate]) * EmbeddingWeight +
      LevenshteinSimilarity(QueryUCS4, StringToUCS4(FTexts[Candidate])) * LevenshteinWeight;

  // Pick the ResultCount highest scores. Picked candidates are tracked in a
  // separate flag array (new elements start as False), so no score value has
  // to be overwritten with a sentinel and equal scores are handled correctly.
  SetLength(Taken, FTexts.Count);
  SetLength(Result, ResultCount);
  for Slot := 0 to ResultCount - 1 do
  begin
    BestIndex := -1;
    for Candidate := 0 to High(Scores) do
      // Strict '>' keeps the earliest candidate among equal scores.
      if (not Taken[Candidate]) and
         ((BestIndex < 0) or (Scores[Candidate] > Scores[BestIndex])) then
        BestIndex := Candidate;

    // ResultCount <= FTexts.Count guarantees an unpicked candidate exists.
    Taken[BestIndex] := True;
    Result[Slot].Text := FTexts[BestIndex];
    Result[Slot].Similarity := Scores[BestIndex];
    Result[Slot].OriginalIndex := BestIndex;
  end;
  // Results were selected from best to worst, so they are already sorted by
  // descending similarity.
end;

{ Converts S into a ucs4 value.
  The result is first initialised with ucs4.Init and then filled by
  ucs4.FromUTF8, so S is presumably expected to be UTF-8 encoded
  (confirm against ucs4unit). }
function StringToUCS4(const S: string): ucs4;
begin
  Result.Init;
  Result.FromUTF8(S);
end;

{ Cosine of the angle between vectors A and B.

  Returns 0.0 when the vectors cannot be compared: either one is empty,
  their lengths differ, or one of them has zero norm. Otherwise returns
  dot(A, B) / (|A| * |B|). }
function CosineSimilarity(const A, B: TDoubleArray): Double;
var
  k: Integer;
  Dot, SumSqA, SumSqB: Double;
begin
  Result := 0.0;

  // Only non-empty vectors of equal length are comparable.
  if (Length(A) > 0) and (Length(A) = Length(B)) then
  begin
    Dot := 0;
    SumSqA := 0;
    SumSqB := 0;

    // Accumulate the dot product and both squared norms in a single pass.
    for k := Low(A) to High(A) do
    begin
      Dot := Dot + A[k] * B[k];
      SumSqA := SumSqA + A[k] * A[k];
      SumSqB := SumSqB + B[k] * B[k];
    end;

    // A zero-norm vector has no direction; keep the 0.0 default in that case.
    if (SumSqA <> 0) and (SumSqB <> 0) then
      Result := Dot / (Sqrt(SumSqA) * Sqrt(SumSqB));
  end;
end;

{ Similarity derived from the Levenshtein distance of S1 and S2:
  1 - distance / Max(S1.Length, S2.Length).
  Two inputs of length 0 are treated as identical and yield 1.0. }
function LevenshteinSimilarity(const S1, S2: ucs4): Double;
var
  EditCount, Longest: Integer;
begin
  EditCount := LevenshteinDistance(S1, S2);
  Longest := Max(S1.Length, S2.Length);

  if Longest <> 0 then
    // Normalise the distance by the longer input.
    Result := 1.0 - (EditCount / Longest)
  else
    Result := 1.0;
end;

// ... (SaveToFile/LoadFromFile implementation is not written yet)
end.