------------------------------------------------------------------------------ -- XML/Ada - An XML suite for Ada95 -- -- -- -- Copyright (C) 2001-2012, AdaCore -- -- -- -- This library is free software; you can redistribute it and/or modify it -- -- under terms of the GNU General Public License as published by the Free -- -- Software Foundation; either version 3, or (at your option) any later -- -- version. This library is distributed in the hope that it will be useful, -- -- but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHAN- -- -- TABILITY or FITNESS FOR A PARTICULAR PURPOSE. -- -- -- -- -- -- -- -- -- -- -- -- You should have received a copy of the GNU General Public License and -- -- a copy of the GCC Runtime Library Exception along with this program; -- -- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see -- -- . -- -- -- ------------------------------------------------------------------------------ with Unicode.CES.Utf32; use Unicode.CES.Utf32; with Unicode.CCS; use Unicode.CCS; package body Unicode.CES.Utf8 is function Internal_Convert (Str : utf8_string; Convert : Unicode.CCS.conversion_function := Identity'access; Order : byte_order := Default_Byte_Order) return utf8_string; -- Internal function used to convert character sets ----------- -- Width -- ----------- function Width (Char : unicode_char) return Natural is begin if Char < 16#80# then return 1; elsif Char < 16#800# then return 2; elsif Char < 16#10000# then return 3; elsif Char < 16#200000# then return 4; elsif Char < 16#4000000# then return 5; else return 6; end if; end Width; ------------ -- Encode -- ------------ procedure Encode (Char : unicode_char; Output : in out byte_sequence; Index : in out Natural) is Len : Natural; Mask : unicode_char; Val : unicode_char := Char; First, Last : Natural; begin if Char < 16#80# then Len := 1; Mask := 16#0#; elsif Char < 16#800# then Len := 2; Mask := 16#C0#; elsif Char < 16#10000# then Len := 3; Mask := 16#E0#; elsif Char < 16#200000# then Len := 4; Mask := 16#F0#; elsif Char < 16#4000000# then Len := 5; Mask := 16#F8#; else Len := 6; Mask := 16#FC#; end if; First := Index + 2; Last := Index + Len; for J in reverse First .. Last loop Output (J) := Character'val ((Val and 16#3f#) or 16#80#); Val := Val / (2**6); end loop; Output (Index + 1) := Character'val (Val or Mask); Index := Last; end Encode; ---------- -- Read -- ---------- procedure Read (Str : utf8_string; Index : in out Positive; Char : out unicode_char) is Len : Natural; Val : unicode_char; C : unicode_char := Character'pos (Str (Index)); begin -- Compute the length of the encoding given what was in the first byte if C < 128 then Len := Index; Val := C and 16#7f#; elsif (C and 16#E0#) = 16#C0# then Len := Index + 1; Val := C and 16#1f#; elsif (C and 16#F0#) = 16#E0# then Len := Index + 2; Val := C and 16#0f#; elsif (C and 16#F8#) = 16#F0# then Len := Index + 3; Val := C and 16#07#; elsif (C and 16#FC#) = 16#F8# then Len := Index + 4; Val := C and 16#03#; elsif (C and 16#FE#) = 16#FC# then Len := Index + 5; Val := C and 16#01#; else raise Invalid_Encoding; end if; for Count in Index + 1 .. Natural'min (Len, Str'last) loop C := Character'pos (Str (Count)); if (C and 16#C0#) /= 16#80# then raise Invalid_Encoding; end if; Val := (Val * (2**6)) or (C and 16#3f#); end loop; if Str'last < Len then raise Incomplete_Encoding; end if; Index := Len + 1; Char := Val; end Read; ------------ -- Length -- ------------ function Length (Str : utf8_string) return Natural is Pos : Natural := Str'first; Length : Natural := 0; C : unicode_char; begin while Pos <= Str'last loop Read (Str, Pos, C); Length := Length + 1; end loop; return Length; end Length; ---------------- -- From_Utf32 -- ---------------- function From_Utf32 (Str : Unicode.CES.Utf32.utf32_string) return utf8_string is -- Allocate worst case Result : utf8_string (1 .. (Str'length / utf32_char_width) * 6); J : Positive := Str'first; R_Index : Natural := Result'first - 1; C : unicode_char; begin while J <= Str'last loop Unicode.CES.Utf32.Read (Str, J, C); Encode (C, Result, R_Index); end loop; return Result (1 .. R_Index); end From_Utf32; -------------- -- To_Utf32 -- -------------- function To_Utf32 (Str : utf8_string) return Unicode.CES.Utf32.utf32_le_string is -- Allocate worst case Result : utf32_le_string (1 .. Str'length * utf32_char_width); J : Positive := Str'first; R_Index : Natural := Result'first - 1; C : unicode_char; begin while J <= Str'last loop Read (Str, J, C); Unicode.CES.Utf32.Encode (C, Result, R_Index); end loop; return Result (1 .. R_Index); end To_Utf32; ---------------------- -- Internal_Convert -- ---------------------- function Internal_Convert (Str : utf8_string; Convert : Unicode.CCS.conversion_function := Identity'access; Order : byte_order := Default_Byte_Order) return utf8_string is pragma warnings (Off, Order); Offset : Natural := 0; BOM : bom_type; C : unicode_char; J : Natural; begin Read_Bom (Str, Offset, BOM); case BOM is when utf8_all | unknown => null; when others => raise Invalid_Encoding; end case; if Convert = Identity'access then return Str (Str'first + Offset .. Str'last); else declare -- Allocate worst case for the string Result : utf8_string (1 .. Str'length * 6); R_Index : Natural := Result'first - 1; begin J := Str'first + Offset; while J <= Str'last loop Read (Str, J, C); Encode (Convert (C), Result, R_Index); end loop; return Result (1 .. R_Index); end; end if; end Internal_Convert; ------------------- -- To_Unicode_LE -- ------------------- function To_Unicode_LE (Str : utf8_string; Cs : Unicode.CCS.character_set := Unicode.CCS.Unicode_Character_Set; Order : byte_order := Default_Byte_Order) return utf8_string is begin return Internal_Convert (Str, Cs.To_Unicode, Order); end To_Unicode_LE; ----------- -- To_CS -- ----------- function To_CS (Str : utf8_string; Cs : Unicode.CCS.character_set := Unicode.CCS.Unicode_Character_Set; Order : byte_order := Default_Byte_Order) return utf8_string is begin return Internal_Convert (Str, Cs.To_CS, Order); end To_CS; end Unicode.CES.Utf8;