unit OpenCorporaTxtConverter;
{$MODE OBJFPC}{$H+}{$RANGECHECKS ON}

{
    Part of AdvancedChatAI.
    For GNU/Linux 64 bit version.
    Version: 1.
    Written in FreePascal (https://freepascal.org/).
    Copyright (C) 2025-2026 Artyomov Alexander
    Used https://chat.deepseek.com/
    http://self-made-free.ru/
    aralni@mail.ru

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
}

interface

uses
  SysUtils, Classes, StrUtils;

procedure ConvertOpenCorporaTxtToDict(const InputFile, OutputFile: string);

implementation

const
  ForbiddenPrefixes: array[0..10] of string = (
    'ёб', 'ёп', 'ху', 'пид', 'бля', 'ебу', 'пизд', 'ебан', 'манда', 'хер', 'залуп'
  );

function IsForbiddenWord(const Word: string): Boolean;
var
  i: Integer;
  LowerWord: string;
begin
  LowerWord := LowerCase(Word);
  Result := False;
  for i := 0 to High(ForbiddenPrefixes) do
  begin
    if Pos(ForbiddenPrefixes[i], LowerWord) = 1 then
    begin
      Result := True;
      Exit;
    end;
  end;
end;

procedure ConvertOpenCorporaTxtToDict(const InputFile, OutputFile: string);
var
  Input, Output, Lemmas: TStringList;
  i, SpacePos: Integer;
  Line, WordForm, Lemma, Tags: string;
begin
  Input := TStringList.Create;
  Output := TStringList.Create;
  Lemmas := TStringList.Create;
  try
    // Загрузка данных
    WriteLn('Loading input file...');
    Input.LoadFromFile(InputFile);
    
    WriteLn('Processing ', Input.Count, ' lines...');
    
    // Первый проход: сбор всех лемм
    for i := 0 to Input.Count - 1 do
    begin
      Line := Trim(Input[i]);
      if (Line = '') or (Line[1] in ['0'..'9']) then Continue;
      
      SpacePos := Pos(' ', Line);
      if SpacePos = 0 then Continue;
      
      WordForm := Trim(Copy(Line, 1, SpacePos - 1));
      Tags := Trim(Copy(Line, SpacePos + 1, Length(Line)));
      
      if not IsForbiddenWord(WordForm) and 
         (Pos('NOUN', Tags) > 0) and 
         (Pos('nomn', Tags) > 0) and 
         (Pos('sing', Tags) > 0) then
      begin
        Lemmas.Values[WordForm] := Tags; // Сохраняем леммы существительных
      end;
    end;
    
    // Второй проход: создание словаря
    for i := 0 to Input.Count - 1 do
    begin
      Line := Trim(Input[i]);
      if (Line = '') or (Line[1] in ['0'..'9']) then Continue;
      
      SpacePos := Pos(' ', Line);
      if SpacePos = 0 then Continue;
      
      WordForm := Trim(Copy(Line, 1, SpacePos - 1));
      Tags := Trim(Copy(Line, SpacePos + 1, Length(Line)));
      
      if IsForbiddenWord(WordForm) then Continue;
      
      // Определение леммы
      if (Pos('NOUN', Tags) > 0) then
      begin
        if (Pos('nomn', Tags) > 0) and (Pos('sing', Tags) > 0) then
          Lemma := WordForm // Это лемма
        else
          // Ищем соответствующую лемму
          Lemma := Lemmas.Names[Lemmas.IndexOfName(WordForm)];
      end
      else
        Lemma := WordForm; // Для других частей речи
      
      if Lemma <> '' then
        Output.Add(WordForm + '|' + Lemma);
      
      if i mod 100000 = 0 then
        WriteLn('Processed ', i, ' of ', Input.Count, ' lines');
    end;
    
    // Удаление дубликатов
    Output.Sorted := True;
    Output.Duplicates := dupIgnore;
    
    WriteLn('Saving results...');
    Output.SaveToFile(OutputFile);
    WriteLn('Done. Saved ', Output.Count, ' entries.');
  finally
    Input.Free;
    Output.Free;
    Lemmas.Free;
  end;
end;

end.