unit Lemmatizer;

{
    Part of AdvancedChatAI.
    For GNU/Linux 64 bit version.
    Version: 1.
    Written on FreePascal (https://freepascal.org/).
    Copyright (C) 2025-2026 Artyomov Alexander
    Used https://chat.deepseek.com/
    http://self-made-free.ru/
    aralni@mail.ru

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
}

{$MODE OBJFPC}{$H+}{$RANGECHECKS ON}

interface

uses
  SysUtils, Classes, StrUtils;

type
  TLemmatizer = class
  private
    FDictionary: TStringList;
    FExceptions: TStringList;
    FSuffixRules: TStringList;
    FPrefixRules: TStringList;
    procedure LoadDictionary(const FileName: string);
    procedure LoadRules(const FileName: string; Rules: TStringList);
    function ApplyRules(const Word: string; Rules: TStringList): string;
    function FindInDictionary(const Word: string): string;
  public
    constructor Create(const RulesDir: string = '');
    destructor Destroy; override;
    function Lemmatize(const Token: string): string;
    procedure AddException(const WordForm, Lemma: string);
  end;

implementation

constructor TLemmatizer.Create(const RulesDir: string = '');
begin
  FDictionary := TStringList.Create;
  FExceptions := TStringList.Create;
  FSuffixRules := TStringList.Create;
  FPrefixRules := TStringList.Create;

  // Настройка словарей
  FDictionary.Sorted := True;
  FDictionary.Duplicates := dupIgnore;
  FExceptions.Sorted := True;
  FExceptions.Duplicates := dupIgnore;

  // Загрузка данных
  if RulesDir <> '' then
  begin
    LoadDictionary(RulesDir + 'dictionary.txt');
    LoadRules(RulesDir + 'exceptions.txt', FExceptions);
    LoadRules(RulesDir + 'suffixes.txt', FSuffixRules);
    LoadRules(RulesDir + 'prefixes.txt', FPrefixRules);
  end;
end;

destructor TLemmatizer.Destroy;
begin
  FDictionary.Free;
  FExceptions.Free;
  FSuffixRules.Free;
  FPrefixRules.Free;
  inherited;
end;

procedure TLemmatizer.LoadDictionary(const FileName: string);
begin
  if FileExists(FileName) then
  begin
    FDictionary.LoadFromFile(FileName);
    FDictionary.Sorted := True;
  end;
end;

procedure TLemmatizer.LoadRules(const FileName: string; Rules: TStringList);
begin
  if FileExists(FileName) then
    Rules.LoadFromFile(FileName);
end;

function TLemmatizer.FindInDictionary(const Word: string): string;
var
  Index: Integer;
begin
  Index := FDictionary.IndexOfName(LowerCase(Word));
  if Index >= 0 then
    Result := FDictionary.ValueFromIndex[Index]
  else
    Result := '';
end;

function TLemmatizer.ApplyRules(const Word: string; Rules: TStringList): string;
var
  i: Integer;
  RuleParts: TStringArray;
begin
  Result := Word;
  for i := 0 to Rules.Count - 1 do
  begin
    RuleParts := SplitString(Rules[i], '=');
    if Length(RuleParts) = 2 then
    begin
      if AnsiEndsStr(RuleParts[0], Word) then
      begin
        Result := Copy(Word, 1, Length(Word) - Length(RuleParts[0])) + RuleParts[1];
        Exit;
      end;
    end;
  end;
end;

procedure TLemmatizer.AddException(const WordForm, Lemma: string);
begin
  FExceptions.Values[LowerCase(WordForm)] := LowerCase(Lemma);
end;

function TLemmatizer.Lemmatize(const Token: string): string;
var
  LowerToken: string;
  Index: Integer;
begin
  LowerToken := LowerCase(Token);

  // 1. Проверка исключений
  Index := FExceptions.IndexOfName(LowerToken);
  if Index >= 0 then
    Exit(FExceptions.ValueFromIndex[Index]);

  // 2. Поиск в словаре
  Result := FindInDictionary(LowerToken);
  if Result <> '' then
    Exit;

  // 3. Применение правил для суффиксов
  Result := ApplyRules(LowerToken, FSuffixRules);

  // 4. Проверка результата в словаре
  if (Result <> LowerToken) and (FindInDictionary(Result) <> '') then
    Exit;

  // 5. Применение правил для префиксов
  Result := ApplyRules(LowerToken, FPrefixRules);

  // 6. Если ничего не помогло, возвращаем исходное слово
  if Result = LowerToken then
    Result := Token
  else if FindInDictionary(Result) = '' then
    Result := Token;
end;

end.