TextEncodings.pas

{******************************************************************************}
{                                                                              }
{ Text Encodings Utilities                                                     }
{                                                                              }
{ Copyright © 2016 Andreas Rejbrand                                            }
{                                                                              }
{ http://english.rejbrand.se/                                                  }
{                                                                              }
{******************************************************************************}

unit TextEncodings;

interface

uses SysUtils, Classes;

type
  TUTF32LEEncoding = class(TEncoding)
  strict protected
    function GetByteCount(Chars: PChar; CharCount: Integer): Integer; overload; override;
    function GetBytes(Chars: PChar; CharCount: Integer; Bytes: PByte; ByteCount: Integer): Integer; overload; override;
    function GetCharCount(Bytes: PByte; ByteCount: Integer): Integer; overload; override;
    function GetChars(Bytes: PByte; ByteCount: Integer; Chars: PChar; CharCount: Integer): Integer; overload; override;
  public
    constructor Create; virtual;
    function GetMaxByteCount(CharCount: Integer): Integer; override;
    function GetMaxCharCount(ByteCount: Integer): Integer; override;
    function GetPreamble: TBytes; override;
  end;

  TUTF32BEEncoding = class(TUTF32LEEncoding)
  strict protected
    function GetBytes(Chars: PChar; CharCount: Integer; Bytes: PByte; ByteCount: Integer): Integer; overload; override;
    function GetChars(Bytes: PByte; ByteCount: Integer; Chars: PChar; CharCount: Integer): Integer; overload; override;
  public
    function GetPreamble: TBytes; override;
  end;

var
  UTF32LEEncoding: TUTF32LEEncoding;
  UTF32BEEncoding: TUTF32BEEncoding;

type
  THeuristicsSetting = (hsQuick, hsNormal, hsCareful);

  TTextEncoding = (teASCII, teWindows8bitCodepage, teUTF8, teUTF16LE, teUTF16BE,
    teUTF32LE, teUTF32BE);
  TTextEncodings = set of TTextEncoding;

function GetVCLEncoding(ATextEncoding: TTextEncoding): TEncoding;
function GetEncodingFromVCL(AEncoding: TEncoding): TTextEncoding;

const
  TextEncodingsStrs: array[TTextEncoding] of string =
    ('teASCII', 'teWindows8bitCodepage', 'teUTF8', 'teUTF16LE', 'teUTF16BE',
     'teUTF32LE', 'teUTF32BE');

type
  TLineBreakType = (lbtCRLF, lbtCR, lbtLF);

  PTextFileFormatInfo = ^TTextFileFormatInfo;
  TTextFileFormatInfo = record
    TextEncoding: TTextEncoding;
    MagicWord: boolean;
    LineBreakType: TLineBreakType;
    procedure SetEncoding(ATextEncoding: TTextEncoding);
    procedure SetMagicWord(AMagicWord: boolean);
    procedure SetLineBreakType(ALineBreakType: TLineBreakType);
    function GetVCLEncoding: TEncoding;
    procedure SetHasMagicWord(MagicWords: TTextEncodings);
  end;

const
  DEFAULT_TEXT_FILE_FORMAT_INFO: TTextFileFormatInfo = (TextEncoding: teUTF8;
    MagicWord: false; LineBreakType: lbtCRLF);

var
  PreferASCII: boolean = false;
  PreferANSI: boolean = false;

function GuessEncodingOfFile(const FileName: TFileName;
  out PositiveEncoding: TTextEncoding;
  out PossibleEncodings: TTextEncodings;
  out MagicWordsFound: TTextEncodings;
  HeuristicsSetting: THeuristicsSetting = hsNormal): boolean;

implementation

uses Windows, Math;

function EncodingCount(const ASet: TTextEncodings): integer;
var
  encoding: TTextEncoding;
begin
  result := 0;
  for encoding in ASet do
    inc(result);
end;

function Swap32(Value: cardinal): cardinal; register;
asm
  bswap eax
end;

function Swap16(Value: word): word; inline;
begin
  result := (Value shl 8) or (Value shr 8);
end;

function MagicBytesMatching(const Buf, MagicBytes: array of byte): boolean;
var
  i: Integer;
begin
  result := false;
  if Length(Buf) < Length(MagicBytes) then
    Exit;
  for i := 0 to high(MagicBytes) do
    if Buf[i] <> MagicBytes[i] then
      Exit;
  result := true;
end;

function GuessEncodingOfFile(const FileName: TFileName;
  out PositiveEncoding: TTextEncoding;
  out PossibleEncodings: TTextEncodings;
  out MagicWordsFound: TTextEncodings;
  HeuristicsSetting: THeuristicsSetting = hsNormal): boolean;

const
  BUFFER_SIZE = 1024*1024;

type
  TUTF8ByteType = (utf8single, utf8leading, utf8continuation);
  TUTF16WordType = (utf16single, utf16HighSurrogate, utf16LowSurrogate);
  PWordArray = ^TWordArray;
  TWordArray = array[0..BUFFER_SIZE div 2 - 1] of word;
  PCardinalArray = ^TCardinalArray;
  TCardinalArray = array[0..BUFFER_SIZE div 4 - 1] of cardinal;

const
  UTF8: array[0..2] of byte = ($EF, $BB, $BF);
  UTF16LE: array[0..1] of byte = ($FF, $FE);
  UTF16BE: array[0..1] of byte = ($FE, $FF);
  UTF32LE: array[0..3] of byte = ($FF, $FE, 0, 0);
  UTF32BE: array[0..3] of byte = (0, 0, $FE, $FF);

var
  teASCIIAnsiUTF8: TTextEncoding;
  NontrivialPossibilities: TTextEncodings;
  nulls, oddnulls, evennulls, nullwords: Int64;

  crlf, sp: Int64;
  wrd0020, wrd2000: Int64;
  wrd000D, wrd0D00: Int64;
  wrd000A, wrd0A00: Int64;
  wrdLEPUA, wrdBEPUA: Int64;

  FS: TFileStream;
  bytes: array of byte;
  words: PWordArray;
  cardinals: PCardinalArray;
  len: integer;

  FSize, FSizeRead: Int64;

  MagicUTF8,
  MagicUTF16LE,
  MagicUTF16BE,
  MagicUTF32LE,
  MagicUTF32BE: boolean;

  i: integer;

  utf8type: TUTF8ByteType;
  utf8count: integer;
  utf8oversizewarning: boolean;
  utf8cp: integer;

  utf16type: TUTF16WordType;
  utf16unit: word;
  SurrogatePairsLE: Int64;
  SurrogatePairsBE: Int64;
  SurrogatePairsMax: Int64;

  utf16cardinal: cardinal;

  e: TTextEncoding;

  tc1: cardinal;

type
  TUTF16TestValue = (utf16tvLE, utf16tvBE, utf16tvFail);

  function TestUTF16: TUTF16TestValue;
  begin

    if (teUTF16LE in PossibleEncodings) and not (teUTF16BE in PossibleEncodings) then
      Exit(utf16tvLE)
    else if (teUTF16BE in PossibleEncodings) and not (teUTF16LE in PossibleEncodings) then
      Exit(utf16tvBE)
    else if (not (teUTF16BE in PossibleEncodings)) and (not (teUTF16LE in PossibleEncodings)) then
      Exit(utf16tvFail);

    // odd or even nulls?
    if (oddnulls > 4) and (oddnulls > 2*evennulls) then
      Exit(utf16tvLE)
    else if (evennulls > 4) and (evennulls > 2*oddnulls) then
      Exit(utf16tvBE);

    // U+000D: Carriage return (CR) probably more common than U+0D00: (no descr.)
    if wrd000D > wrd0D00 then
      Exit(utf16tvLE)
    else if wrd0D00 > wrd000D then
      Exit(utf16tvBE);

    // U+000A: Line feed (LF) probably more common than U+0A00: (no descr.)
    if wrd000A > wrd0A00 then
      Exit(utf16tvLE)
    else if wrd0A00 > wrd000A then
      Exit(utf16tvBE);

    // U+0020: SPACE probably more common than U+2000: EN QUAD
    if wrd0020 > wrd2000 then
      Exit(utf16tvLE)
    else if wrd2000 > wrd0020 then
      Exit(utf16tvBE);

    // Look for PUA
    if (wrdLEPUA = 0) and (wrdBEPUA > 0) then
      Exit(utf16tvLE)
    else if (wrdLEPUA > 0) and (wrdBEPUA = 0) then
      Exit(utf16tvBE);

    // Windows default
    Exit(utf16tvLE);

  end;

  function TestUTF16PUA: TUTF16TestValue;
  begin
    if (wrdLEPUA = 0) and (wrdBEPUA > 0) and (teUTF16LE in PossibleEncodings) then
      Exit(utf16tvLE)
    else if (wrdLEPUA > 0) and (wrdBEPUA = 0) and (teUTF16BE in PossibleEncodings) then
      Exit(utf16tvBE)
    else
      Exit(utf16tvFail);
  end;

begin

  PositiveEncoding := teWindows8bitCodepage;
  result := false;
  PossibleEncodings := [teASCII, teWindows8bitCodepage, teUTF8, teUTF16LE,
    teUTF16BE, teUTF32LE, teUTF32BE];

  if PreferASCII then
    teASCIIAnsiUTF8 := teASCII
  else if PreferANSI then
    teASCIIAnsiUTF8 := teWindows8bitCodepage
  else
    teASCIIAnsiUTF8 := teUTF8;

  FS := TFileStream.Create(FileName, fmOpenRead);
  try

    // Empty file?
    FSize := FS.Size;
    FSizeRead := FSize;

    if FSize = 0 then
    begin
      PositiveEncoding := teASCIIAnsiUTF8;
      Exit(true);                                                                {*}
    end;

    // Search for magic bytes (BOMs)
    MagicUTF8 := false;
    MagicUTF16LE := false;
    MagicUTF16BE := false;
    MagicUTF32LE := false;
    MagicUTF32BE := false;

    SetLength(bytes, 4);
    len := FS.Read(bytes[0], length(bytes));
    SetLength(bytes, len);

    MagicWordsFound := [];

    if MagicBytesMatching(bytes, UTF32LE) then
    begin
      MagicUTF32LE := true;
      Include(MagicWordsFound, teUTF32LE);
      if HeuristicsSetting = hsQuick then
      begin
        PositiveEncoding := teUTF32LE;
        Exit(true);
      end;
    end;

    if MagicBytesMatching(bytes, UTF32BE) then
    begin
      MagicUTF32BE := true;
      Include(MagicWordsFound, teUTF32BE);
      if HeuristicsSetting = hsQuick then
      begin
        PositiveEncoding := teUTF32BE;
        Exit(true);
      end;
    end;

    if MagicBytesMatching(bytes, UTF8) then
    begin
      MagicUTF8 := true;
      Include(MagicWordsFound, teUTF8);
      if HeuristicsSetting = hsQuick then
      begin
        PositiveEncoding := teUTF8;
        Exit(true);
      end;
    end;

    if MagicBytesMatching(bytes, UTF16LE) then
    begin
      MagicUTF16LE := true;
      Include(MagicWordsFound, teUTF16LE);
      if HeuristicsSetting = hsQuick then
      begin
        PositiveEncoding := teUTF16LE;
        Exit(true);
      end;
    end;

    if MagicBytesMatching(bytes, UTF16BE) then
    begin
      MagicUTF16BE := true;
      Include(MagicWordsFound, teUTF16BE);
      if HeuristicsSetting = hsQuick then
      begin
        PositiveEncoding := teUTF16BE;
        Exit(true);
      end;
    end;

    // Check format conformance
    if FSize mod 2 <> 0 then
    begin
      Exclude(PossibleEncodings, teUTF16LE);
      Exclude(PossibleEncodings, teUTF16BE);
    end;

    if FSize mod 4 <> 0 then
    begin
      Exclude(PossibleEncodings, teUTF32LE);
      Exclude(PossibleEncodings, teUTF32BE);
    end;

    Assert(BUFFER_SIZE mod 4 = 0);

    nulls := 0;
    oddnulls := 0;
    evennulls := 0;
    nullwords := 0;
    crlf := 0;
    sp := 0;
    wrd0020 := 0;
    wrd2000 := 0;
    wrd000D := 0;
    wrd0D00 := 0;
    wrd000A := 0;
    wrd0A00 := 0;
    wrdLEPUA := 0;
    wrdBEPUA := 0;
    SurrogatePairsLE := 0;
    SurrogatePairsBE := 0;

    utf8type := utf8single;
    utf8count := 0;
    utf8oversizewarning := false; // to get rid of W1036

    utf16type := utf16single;

    tc1 := GetTickCount;

    FS.Position := 0;

    while FS.Position < FSize do
    begin
      if PossibleEncodings = [teWindows8bitCodepage] then
        break;                                                                   { <-- break }

      if HeuristicsSetting <> hsCareful then
        if GetTickCount - tc1 > 1000 then
        begin
          FSizeRead := FS.Position;
          break;
        end;

      SetLength(bytes, BUFFER_SIZE);
      len := FS.Read(bytes[0], BUFFER_SIZE);
      if len <> BUFFER_SIZE then
        SetLength(bytes, len);

      words := PWordArray(@bytes[0]);
      cardinals := PCardinalArray(@bytes[0]);

      for i := 0 to length(bytes) div 2 - 1 do
        if words[i] = 0 then
          inc(nullwords)
        else if words[i] = $0D0A then
          inc(crlf)
        else if words[i] = $0020 then
          inc(wrd0020)
        else if words[i] = $2000 then
          inc(wrd2000)
        else if words[i] = $000D then
          inc(wrd000D)
        else if words[i] = $0D00 then
          inc(wrd0D00)
        else if words[i] = $000A then
          inc(wrd000A)
        else if words[i] = $0A00 then
          inc(wrd0A00)
        else if InRange(words[i], $E000, $F8FF) then
          inc(wrdLEPUA)
        else if InRange(Swap16(words[i]), $E000, $F8FF) then
          inc(wrdBEPUA);


      for i := 0 to high(bytes) do
        if bytes[i] = 0 then
        begin
          inc(nulls);
          if odd(i) then
            inc(oddnulls);
        end
        else if bytes[i] = $20 then
          inc(sp);

      evennulls := nulls - oddnulls;

      // ASCII
      if teASCII in PossibleEncodings then
        for i := 0 to high(bytes) do
          if bytes[i] > 127 then
          begin
            Exclude(PossibleEncodings, teASCII);
            break;
          end;

      // UTF-8
      if teUTF8 in PossibleEncodings then
      begin

        if not (teASCII in PossibleEncodings) then // ASCII valid >= UTF-8 valid
          for i := 0 to high(bytes) do
          begin
            if bytes[i] <= 127 then // 0xxxxxxx: single byte
            begin
              if utf8count > 0 then
              begin
                Exclude(PossibleEncodings, teUTF8);
                break;
              end;
              utf8type := utf8single;
            end
            else if bytes[i] and $C0 = $80 then // 10xxxxxx: continuation byte
            begin
              if (utf8type = utf8single) or (utf8count = 0) then
              begin
                Exclude(PossibleEncodings, teUTF8);
                break;
              end;
              utf8type := utf8continuation;
              dec(utf8count);
              if utf8oversizewarning then
              begin
                inc(utf8cp, bytes[i] and $3F);
                if (utf8count = 0) and (utf8cp > $10FFFF) then
                begin
                  Exclude(PossibleEncodings, teUTF8);
                  break;
                end;
              end;
            end
            else
            begin // 11xxxxxx: possibly leading byte
              if (utf8type = utf8leading) or (utf8count > 0) then
              begin
                Exclude(PossibleEncodings, teUTF8);
                break;
              end;

              utf8type := utf8leading;
              utf8oversizewarning := false;

              if (bytes[i] and $E0 = $C0) then // 110xxxxx: unit length = 2
                utf8count := 1 // 1 remaining
              else if (bytes[i] and $F0 = $E0) then // 1110xxxx: unit length = 3
                utf8count := 2 // 2 remaining
              else if (bytes[i] and $F8 = $F0) then // 11110xxx: unit length = 4
              begin
                utf8count := 3; // 3 remaining
                utf8oversizewarning := true; // the encoded codepoint might exceed $10FFFF (4 UTF-8 bytes, i.e. 21 effective bits, gives a theoretical max cp of $1FFFFF)
                utf8cp := bytes[i] and $7;
              end
              else
              begin // invalid byte
                Exclude(PossibleEncodings, teUTF8);
                break;
              end;
            end;
          end;

      end;

      // UTF-16LE
      if teUTF16LE in PossibleEncodings then
        for i := 0 to length(bytes) div 2 - 1 do
        begin
          utf16unit := words[i];
          if (utf16unit <= $D7FF) or (utf16unit >= $E000) then // BMP char
          begin
            if utf16type = utf16HighSurrogate then
            begin
              Exclude(PossibleEncodings, teUTF16LE);
              break;
            end;
            utf16type := utf16single;
          end
          else if utf16unit < $DC00 then // high surrogate
          begin
            if utf16type = utf16HighSurrogate then
            begin
              Exclude(PossibleEncodings, teUTF16LE);
              break;
            end;
            utf16type := utf16HighSurrogate;
          end
          else // low surrogate
          begin
            if utf16type <> utf16HighSurrogate then
            begin
              Exclude(PossibleEncodings, teUTF16LE);
              break;
            end;
            utf16type := utf16LowSurrogate;
            inc(SurrogatePairsLE);
          end;
        end;

      // UTF-16BE
      if teUTF16BE in PossibleEncodings then
        for i := 0 to length(bytes) div 2 - 1 do
        begin
          utf16unit := Swap16(words[i]);
          if (utf16unit <= $D7FF) or (utf16unit >= $E000) then // BMP char
          begin
            if utf16type = utf16HighSurrogate then
            begin
              Exclude(PossibleEncodings, teUTF16BE);
              break;
            end;
            utf16type := utf16single;
          end
          else if utf16unit < $DC00 then // high surrogate
          begin
            if utf16type = utf16HighSurrogate then
            begin
              Exclude(PossibleEncodings, teUTF16BE);
              break;
            end;
            utf16type := utf16HighSurrogate;
          end
          else // low surrogate
          begin
            if utf16type <> utf16HighSurrogate then
            begin
              Exclude(PossibleEncodings, teUTF16BE);
              break;
            end;
            utf16type := utf16LowSurrogate;
            inc(SurrogatePairsBE);
          end;
        end;

      // UTF-32LE
      if teUTF32LE in PossibleEncodings then
        for i := 0 to length(bytes) div 4 - 1 do
          if cardinals[i] > $10FFFF then
          begin
            Exclude(PossibleEncodings, teUTF32LE);
            break;
          end;

      // UTF-32BE
      if teUTF32BE in PossibleEncodings then
        for i := 0 to length(bytes) div 4 - 1 do
        begin
          utf16cardinal := Swap32(cardinals[i]);
          if utf16cardinal > $10FFFF then
          begin
            Exclude(PossibleEncodings, teUTF32BE);
            break;
          end;
        end;

    end {while};

  finally
    FS.Free;
  end;

  SurrogatePairsMax := max(SurrogatePairsLE, SurrogatePairsBE);

  // Make a decision
  NontrivialPossibilities := PossibleEncodings - [teWindows8bitCodepage];

  if EncodingCount(NontrivialPossibilities) = 0 then // obvious
  begin
    PositiveEncoding := teWindows8bitCodepage;
    Exit(true);                                                                  {*}             { break-sensitive code ends here }
  end;

  if nulls = FSizeRead then
  begin
    PositiveEncoding := teWindows8bitCodepage;
    Exit(true);                                                                  {*}
  end;

  if MagicUTF8 and (teUTF8 in PossibleEncodings) then
  begin
    PositiveEncoding := teUTF8;                                                  {*}
    Exit(true);
  end
  else if MagicUTF32LE and (teUTF32LE in PossibleEncodings) then
  begin
    PositiveEncoding := teUTF32LE;
    Exit(true);                                                                  {*}
  end
  else if MagicUTF32BE and (teUTF32BE in PossibleEncodings) then
  begin
    PositiveEncoding := teUTF32BE;
    Exit(true);                                                                  {*}
  end
  else if MagicUTF16LE and (teUTF16LE in PossibleEncodings) then
  begin
    PositiveEncoding := teUTF16LE;
    Exit(true);                                                                  {*}
  end
  else if MagicUTF16BE and (teUTF16BE in PossibleEncodings) then
  begin
    PositiveEncoding := teUTF16BE;
    Exit(true);                                                                  {*}
  end;

  if (teUTF8 in PossibleEncodings) and (not (teASCII in PossibleEncodings)) and (nulls = 0) then // obvious
  begin
    PositiveEncoding := teUTF8;
    Exit(true);                                                                  {*}
  end;

  if (teASCII in PossibleEncodings) and (nulls = 0) then
  begin

    // Also in PossibleEncodings: Ansi, UTF-8, UTF-16
    // Not in set: UTF-32
    // Set unique
    // ASCII, Ansi and UTF-8 identical
    // teASCIIAnsiUTF8 or UTF-16?
    // <= 127 but no nulls => unlikely UTF-16
    PositiveEncoding := teASCIIAnsiUTF8;
    Exit(true);                                                                  {*}

  end
  else if (teASCII in PossibleEncodings) {and (nulls > 0)} then
  begin

    // Also in PossibleEncodings: Ansi, UTF-8, UTF-16
    // Possibly in set: UTF-32
    // Nulls suggests not ASCII, Ansi or UTF-8
    // UTF-16 or UTF-32 (if even possible)?

    if nullwords > 0 then
    begin
      // null words suggest not UTF-16
      if teUTF32LE in PossibleEncodings then
      begin
        PositiveEncoding := teUTF32LE;
        Exit(true);                                                              {*}
      end;

      if teUTF32BE in PossibleEncodings then
      begin
        PositiveEncoding := teUTF32BE;
        Exit(true);                                                              {*}
      end;

      // Apparently not...
      // Set unique
      // Notice: we have to live with strange nulls...
      // Could be ASCII chset with nulls as UTF-16
      // teASCIIAnsiUTF8 or UTF-16?
      if (nullwords = 2*nulls) or (nulls > 10*nullwords) then
      begin
        // All nulls are (aligned) words: probably UTF-16
        // or
        // Some nullwords, but many nulls, probably UTF-16

        case TestUTF16 of
          utf16tvLE:
            begin
              PositiveEncoding := teUTF16LE;
              Exit(true);                                                        {*}
            end;
          utf16tvBE:
            begin
              PositiveEncoding := teUTF16BE;
              Exit(true);                                                        {*}
            end;
        end;
      end;

      // Stop guessing: do default choice
      PositiveEncoding := teASCIIAnsiUTF8;
      Exit(true);                                                                {*}

    end
    else
    begin // no nullwords

      // Recall: ASCII possible but null bytes
      // UTF-16 or UTF-32 probable
      // Likely ASCII chest in UTF-16 or UTF-32
      // No null words, then, implies UTF-16 likely

      case TestUTF16 of
        utf16tvLE:
          begin
            PositiveEncoding := teUTF16LE;
            Exit(true);                                                          {*}
          end;
        utf16tvBE:
          begin
            PositiveEncoding := teUTF16BE;
            Exit(true);                                                          {*}
          end;
      end;

    end;

  end
  else {not ASCII} if nulls = 0 then
  begin

    // Possibly in PossibleEncodings: ANSI (certainly), UTF-8, UTF-16
    // Not possible: UTF-32 (since no nulls)
    // Already exited if UTF-8, hence: ANSI (certainly), UTF-16

    // ANSI or UTF-16?
    // Unlikely a UTF-16 Western text or computer code if no null bytes;
    // indeed, there cannot even be spaces and newlines.
    // One-line Chinese text?

    // CRLF in UTF-16LE: U+0D0A: MALAYALAM LETTER UU (ഊ)
    // CRLF in UTF-16BE: <reserved>
    if crlf > 0 then
    begin
      PositiveEncoding := teWindows8bitCodepage;
      Exit(true);                                                                {*}
    end;

    // Many spaces, most words are < 12 chars
    // Unlikely UTF-16
    if sp > FSizeRead div 12 then
    begin
      PositiveEncoding := teWindows8bitCodepage;
      Exit(true);                                                                {*}
    end;

    // No ANSI crlf or sp, and not UTF-16 crlf or sp...
    // A long line of Chinese text?
    if (FSizeRead > 4096) or (SurrogatePairsMax > FSizeRead div (4*4)) then
    begin
      // Since no null and not ASCII, UTF16 does make a few requirements
      // In a "long" text, it is "somewhat" unlikely that "random" ANSI is valid UTF-16
      // or...
      // Unlikely with many perfect surrogate pairs in "random" ANSI

      case TestUTF16 of
        utf16tvLE:
          begin
            PositiveEncoding := teUTF16LE;
            Exit(true);                                                          {*}
          end;
        utf16tvBE:
          begin
            PositiveEncoding := teUTF16BE;
            Exit(true);                                                          {*}
          end;
      end;
    end;

    if SurrogatePairsMax = 0 then
    begin
      // and also no null bytes
      case TestUTF16PUA of
        utf16tvLE:
          begin
            PositiveEncoding := teUTF16LE;
            Exit(false);                                                         {**}
          end;
        utf16tvBE:
          begin
            PositiveEncoding := teUTF16BE;
            Exit(false);                                                         {**}
          end;
        utf16tvFail:
          begin
            PositiveEncoding := teWindows8bitCodepage;
            Exit(false);                                                         {**}
          end;
      end;
    end;

    // Does have surrogate pairs
    case TestUTF16 of
      utf16tvLE:
        begin
          PositiveEncoding := teUTF16LE;
          Exit(true);                                                            {*}
        end;
      utf16tvBE:
        begin
          PositiveEncoding := teUTF16BE;
          Exit(true);                                                            {*}
        end;
    end;

    // Cannot be UTF16
    PositiveEncoding := teWindows8bitCodepage;
    Exit(true);

  end
  else
  begin // not ASCII, does have null bytes

    // Possibly in PossibleEncodings: ANSI (certainly), UTF-8, UTF-16, UTF-32
    // Null bytes make ANSI and UTF-8 unlikely
    // UTF-16 or UTF-32?

    if nullwords > 0 then
    begin
      // UTF-16 unlikely

      if teUTF32LE in PossibleEncodings then
      begin
        PositiveEncoding := teUTF32LE;
        Exit(true);                                                              {*}
      end;

      if teUTF32BE in PossibleEncodings then
      begin
        PositiveEncoding := teUTF32BE;
        Exit(true);                                                              {*}
      end;

      // Nullwords but not UTF-32
      PositiveEncoding := teWindows8bitCodepage;
      Exit(true);                                                                {*}

    end
    else
    begin
      // No nullwords
      // UTF-16 or UTF-32?

      if SurrogatePairsMax > FSizeRead div (4*20) then
      begin
        // Unlikely in UTF-32

        case TestUTF16 of
          utf16tvLE:
            begin
              PositiveEncoding := teUTF16LE;
              Exit(true);                                                        {*}
            end;
          utf16tvBE:
            begin
              PositiveEncoding := teUTF16BE;
              Exit(true);                                                        {*}
            end;
        end;
      end;

      // Valid UTF-32 unlikely in "random" UTF-16
      if teUTF32LE in PossibleEncodings then
      begin
        PositiveEncoding := teUTF32LE;
        Exit(true);                                                              {*}
      end;

      if teUTF32BE in PossibleEncodings then
      begin
        PositiveEncoding := teUTF32BE;
        Exit(true);                                                              {*}
      end;

      // Not UTF-32
      case TestUTF16 of
        utf16tvLE:
          begin
            PositiveEncoding := teUTF16LE;
            Exit(true);                                                          {*}
          end;
        utf16tvBE:
          begin
            PositiveEncoding := teUTF16BE;
            Exit(true);                                                          {*}
          end;
      end;

      // Not UTF-16 either
      PositiveEncoding := teWindows8bitCodepage;
      Exit(true);                                                                {*}

    end;

  end;

  if result then
    Assert(PositiveEncoding in PossibleEncodings, 'PositiveEncoding not in PossibleEncodings');


end;

function GetVCLEncoding(ATextEncoding: TTextEncoding): TEncoding;
begin
  result := nil;
  case ATextEncoding of
    teASCII:
      result := TEncoding.ASCII;
    teWindows8bitCodepage:
      result := TEncoding.Default;
    teUTF8:
      result := TEncoding.UTF8;
    teUTF16LE:
      result := TEncoding.Unicode;
    teUTF16BE:
      result := TEncoding.BigEndianUnicode;
    teUTF32LE:
      result := TextEncodings.UTF32LEEncoding;
    teUTF32BE:
      result := TextEncodings.UTF32BEEncoding;
  end;
end;

function GetEncodingFromVCL(AEncoding: TEncoding): TTextEncoding;
begin
  result := teWindows8bitCodepage;
  if AEncoding = TEncoding.ASCII then
    result := teASCII
  else if AEncoding = TEncoding.Default then
    result := teWindows8bitCodepage
  else if AEncoding = TEncoding.UTF8 then
    result := teUTF8
  else if AEncoding = TEncoding.Unicode then
    result := teUTF16LE
  else if AEncoding = TEncoding.BigEndianUnicode then
    result := teUTF16BE
  else if AEncoding = TextEncodings.UTF32LEEncoding then
    result := teUTF32LE
  else if AEncoding = TextEncodings.UTF32BEEncoding then
    result := teUTF32BE;
end;

{ TTextFileFormatInfo }

function TTextFileFormatInfo.GetVCLEncoding: TEncoding;
begin
  result := TextEncodings.GetVCLEncoding(Self.TextEncoding);
end;

procedure TTextFileFormatInfo.SetEncoding(ATextEncoding: TTextEncoding);
begin
  TextEncoding := ATextEncoding;
end;

procedure TTextFileFormatInfo.SetHasMagicWord(MagicWords: TTextEncodings);
begin
  MagicWord := TextEncoding in MagicWords;
end;

procedure TTextFileFormatInfo.SetLineBreakType(ALineBreakType: TLineBreakType);
begin
  LineBreakType := ALineBreakType;
end;

procedure TTextFileFormatInfo.SetMagicWord(AMagicWord: boolean);
begin
  MagicWord := AMagicWord;
end;

{ TUTF32LEEncoding }

constructor TUTF32LEEncoding.Create;
begin
  FIsSingleByte := false;
  FMaxCharSize := 4;
end;

function TUTF32LEEncoding.GetByteCount(Chars: PChar;
  CharCount: Integer): Integer;
begin
  result := 4*CharCount;
end;

function TUTF32LEEncoding.GetBytes(Chars: PChar; CharCount: Integer;
  Bytes: PByte; ByteCount: Integer): Integer;
var
  i: Integer;
begin
  for i := 0 to CharCount - 1 do
  begin
    PCardinal(Bytes)^ := cardinal(ord(Chars^));
    inc(Chars);
    inc(Bytes, 4);
  end;
  result := 4*CharCount;
end;

function TUTF32LEEncoding.GetCharCount(Bytes: PByte;
  ByteCount: Integer): Integer;
begin
  result := ByteCount div 4;
end;

function TUTF32LEEncoding.GetChars(Bytes: PByte; ByteCount: Integer;
  Chars: PChar; CharCount: Integer): Integer;
var
  i: Integer;
begin
  for i := 0 to CharCount - 1 do
  begin
    Chars^ := Char(Word(PCardinal(Bytes)^));
    inc(Chars);
    inc(Bytes, 4);
  end;
  result := ByteCount div 4;
end;

function TUTF32LEEncoding.GetMaxByteCount(CharCount: Integer): Integer;
begin
  result := 4*CharCount;
end;

function TUTF32LEEncoding.GetMaxCharCount(ByteCount: Integer): Integer;
begin
  result := ByteCount div 4 + 1;
end;

function TUTF32LEEncoding.GetPreamble: TBytes;
begin
  SetLength(result, 4);
  result[0] := $FF;
  result[1] := $FE;
  result[2] := 0;
  result[3] := 0;
end;

{ TUTF32BEEncoding }

function TUTF32BEEncoding.GetBytes(Chars: PChar; CharCount: Integer;
  Bytes: PByte; ByteCount: Integer): Integer;
var
  i: Integer;
begin
  for i := 0 to CharCount - 1 do
  begin
    PCardinal(Bytes)^ := Swap32(cardinal(ord(Chars^)));
    inc(Chars);
    inc(Bytes, 4);
  end;
  result := 4*CharCount;
end;

function TUTF32BEEncoding.GetChars(Bytes: PByte; ByteCount: Integer;
  Chars: PChar; CharCount: Integer): Integer;
var
  i: Integer;
begin
  for i := 0 to CharCount - 1 do
  begin
    Chars^ := Char(Word(Swap32(PCardinal(Bytes)^)));
    inc(Chars);
    inc(Bytes, 4);
  end;
  result := ByteCount div 4;
end;

function TUTF32BEEncoding.GetPreamble: TBytes;
begin
  SetLength(result, 4);
  result[0] := 0;
  result[1] := 0;
  result[2] := $FE;
  result[3] := $FF;
end;

initialization
  UTF32LEEncoding := TUTF32LEEncoding.Create;
  UTF32BEEncoding := TUTF32BEEncoding.Create;

finalization
  UTF32LEEncoding.Free;
  UTF32BEEncoding.Free;

end.