Core: Integrated Unicode data

2018-08-03 20:24:56 +02:00
parent cf24b8abe4
commit 39d1d31639
5 changed files with 36780 additions and 109 deletions
--- a/build/scripts/actions/unicode.lua
+++ b/build/scripts/actions/unicode.lua
@@ -48,15 +48,19 @@ local DirectionToString = {}
 	DirectionToString["EN"]  = "Direction_European_Number"
 	DirectionToString["ES"]  = "Direction_European_Separator"
 	DirectionToString["ET"]  = "Direction_European_Terminator"
 	DirectionToString["FSI"]  = "Direction_First_Strong_Isolate"
 	DirectionToString["L"]	 = "Direction_Left_To_Right"
 	DirectionToString["LRE"] = "Direction_Left_To_Right_Embedding"
 	DirectionToString["LRI"] = "Direction_Left_To_Right_Isolate"
 	DirectionToString["LRO"] = "Direction_Left_To_Right_Override"
 	DirectionToString["NSM"] = "Direction_Nonspacing_Mark"
 	DirectionToString["ON"]	 = "Direction_Other_Neutral"
 	DirectionToString["B"]	 = "Direction_Paragraph_Separator"
-	DirectionToString["PDF"] = "Direction_Pop_Directional_Format"
+	DirectionToString["PDF"] = "Direction_Pop_Directional_Formatting"
 	DirectionToString["PDI"] = "Direction_Pop_Directional_Isolate"
 	DirectionToString["R"]	 = "Direction_Right_To_Left"
 	DirectionToString["RLE"] = "Direction_Right_To_Left_Embedding"
 	DirectionToString["RLI"] = "Direction_Right_To_Left_Isolate"
 	DirectionToString["RLO"] = "Direction_Right_To_Left_Override"
 	DirectionToString["S"]	 = "Direction_Segment_Separator"
 	DirectionToString["WS"]  = "Direction_White_Space"
@@ -69,7 +73,7 @@ table.maxn = table.maxn or function (tab) -- Compatibilit
 		end
 	end
 end
-	
+
 local function getCharacter(tab, first, index)
 	local character = {}
 	character.Category  = CategoryToString[tab[3]] or "Category_NoCategory"
@@ -83,112 +87,179 @@ end
 ACTION.Function = function ()
 	local unicodeSet = {}
 	if (not os.isdir("scripts/data") and not os.mkdir("scripts/data")) then
 		print("Failed to create scripts/data folder")
 	end
-	file = io.open ("scripts/data/UnicodeData.txt", "r")
+	local filepath = "scripts/data/UnicodeData.txt"
 	print("Downloading UnicodeData.txt...")
 	local t1 = os.clock()
 	local result_str, response_code = http.download("https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt", filepath, {
 		headers = { "From: Premake", "Referer: Premake" }
 	})
 	if (response_code ~= 200) then
 		error("Failed to download UnicodeData.txt")
 	end
 	local fileInfo = os.stat(filepath)
 	local t2 = os.clock()
 	print(string.format("Download succeeded (%.3f MiB) in %fs (%d KiB/s)", fileInfo.size / (1024 * 1024), t2 - t1, math.floor((fileInfo.size / (t2 - t1)) / 1024)))
 	file = io.open (filepath, "r")
 	if (not file) then
 		error("Unable to open Unicode Data file")
 		return
 	end
-	local t1 = os.clock()
+	local characters = {}
 	local characterSets = {}
 	local lowercaseCharacters = {}
 	local titlecaseCharacters = {}
 	local uppercaseCharacters = {}
 	local currentBlock
 	local currentBlockStartCodepoint
 	local lineIndex = 1
 	t1 = os.clock()
 	print("Parsing UnicodeData.txt...")
 	local first = 0
 	local last = 0
 	unicodeSet[0] = {}
 	unicodeSet[0].First = 0
 	unicodeSet[0].Characters = {}
 	local currentSet = 0
 	local inblock = false
 	local blockData = nil
 	local unusedIndex = 0
 	local c = 0
 	for line in file:lines() do
-		local old = 0
+		local parts = line:explode(";")
-		local start = string.find(line, ';', old)
+
-		local tab = {}
+		local codepoint = tonumber(parts[1], 16)
-		while (start) do
+		local characterName = parts[2]
-			tab[#tab+1] = string.sub(line, old, start-1, old)
+		local category = parts[3]
-			old = start+1
+		local direction = parts[5]
-			start = string.find(line, ';', old)
+		local uppercaseMapping = tonumber(parts[13], 16)
-		end
+		local lowercaseMapping = tonumber(parts[14], 16)
-		tab[#tab+1] = string.sub(line, old)
+		local titlecaseMapping = tonumber(parts[15], 16)
-		
+
-		local index = tonumber(tab[1], 16)
+		local blockName, blockId = string.match(characterName, "<(.+), (%w+)>")
-		if (index > 0 and not inblock) then
+		if (currentBlock) then
-			if (index-last > 1000) then
+			if (blockId ~= "Last") then
-				unicodeSet[currentSet].Last = last
+				error("Parsing error: expected last block at line " .. lineIndex)
-				currentSet = currentSet + 1
+			end
-				unicodeSet[currentSet] = {}
+
-				unicodeSet[currentSet].First = index
+			print("Detected set " .. blockName .. " from codepoint " .. currentBlockStartCodepoint .. " to " .. codepoint)
-				unicodeSet[currentSet].Characters = {}
+
-				print("Set detected (Begin at " .. first .. ", end at " .. last .. ")")
+			table.insert(characterSets, {
-				first = index
+				startCodepoint = currentBlockStartCodepoint,
 				endCodepoint = codepoint,
 				name = "<" .. blockName .. ">",
 				category = category,
 				direction = direction
 			})
 			currentBlock = nil
 		else
 			if (blockName) then
 				if (blockId ~= "First") then
 					error("Parsing error: expected first block at line " .. lineIndex)
 				end
 				currentBlock = blockName
 				currentBlockStartCodepoint = codepoint
 			else
-				unusedIndex = unusedIndex + index-last-1
+				table.insert(characters, {
-			end
+					codepoint = codepoint,
-		end
+					name = characterName,
-		
+					category = category,
-		local blockName, blockId = string.match(tab[2], "<(.+), (%w+)>")
+					direction = direction,
-		if (blockName ~= nil and blockId ~= nil) then
+					upper = uppercaseMapping,
-			if (blockId == "First") then
+					lower = lowercaseMapping,
-				if (inblock) then
+					title = titlecaseMapping
-					error("Already in block (" .. tab[1] .. ")")
+				})
 				if (lowercaseMapping) then
 					table.insert(lowercaseCharacters, {codepoint = codepoint, lower = lowercaseMapping})
 				end
-				inblock = true
+
-				blockCharacter = getCharacter(tab, first)
+				if (titlecaseMapping) then
-			elseif (blockId == "Last") then
+					table.insert(titlecaseCharacters, {codepoint = codepoint, title = titlecaseMapping})
 				if (not inblock) then
 					error("Not in block (" .. tab[1] .. ")")
 				end
-				inblock = false
+
-				for i=first, index do
+				if (uppercaseMapping) then
-					unicodeSet[currentSet].Characters[i] = getCharacter(tab, first, i)
+					table.insert(uppercaseCharacters, {codepoint = codepoint, upper = uppercaseMapping})
 				end
 			end
 		end
-		unicodeSet[currentSet].Characters[index - first] = getCharacter(tab, first, index)
+		lineIndex = lineIndex + 1
 		if (unicodeSet[currentSet].Characters[index - first].LowerCase ~= (index - first) or 
 			unicodeSet[currentSet].Characters[index - first].UpperCase ~= (index - first) or
 			unicodeSet[currentSet].Characters[index - first].TitleCase ~= (index - first)) then
 			c = c + 1
 		end
 		last = index
 	end
 	unicodeSet[currentSet].Last = last
 	print("Set detected (Begin at " .. first .. ", end at " .. last .. ")")
 	file:close()
-	print("Parsed " .. last+1 .. " characters in " .. #unicodeSet .. " sets, " .. unusedIndex .. " unused indices (took " .. os.difftime(os.clock(), t1) .. " sec)")
+	t2 = os.clock()
 	print("Parsed " .. #characters .. " characters in " .. (t2 - t1) .. " seconds")
 	print("Writting Unicode Data to header...")
 	file = io.open("../src/Nazara/Core/UnicodeData.hpp", "w+")
 	if (not file) then
-		error("Unable to create Unicode Data header")
+		error("Failed to open Unicode Data header")
 		return
 	end
 	print("Writting Unicode Data to header...")
 	t1 = os.clock()
 	for i=0, #unicodeSet do
 		local maxn = table.maxn(unicodeSet[i].Characters)
 		file:write(string.format("Character unicodeSet%d[%d] = {\n", i, maxn+1))
-		for j=0, maxn do
+	file:write(string.format("UnicodeCharacter unicodeCharacters[%d] = {\n", #characters))
-			local v = unicodeSet[i].Characters[j]
+
-			if (v) then
+	for _, data in pairs(characters) do
-				file:write(string.format("\t{%s,%s,%d,%d,%d},\n", v.Category, v.Direction, v.LowerCase, v.TitleCase, v.UpperCase))
+		local category = CategoryToString[data.category]
-			else
+		if (not category) then
-				file:write(string.format("\t{Category_NoCategory,Direction_Boundary_Neutral,%d,%d,%d},\n", j, j, j))
+			error("Unknown category " .. data.category .. " for character " .. data.codepoint)
 			end
 		end
-		
+
-		file:write("};\n\n")
+		local direction = DirectionToString[data.direction]
 		if (not direction) then
 			error("Unknown direction " .. data.direction .. " for character " .. data.codepoint)
 		end
 		file:write(string.format("\t{%d, Unicode::%s, Unicode::%s},\n", data.codepoint, category, direction))
 	end
 	file:write("};\n\n")
 	file:write(string.format("UnicodeSet unicodeSets[%d] = {\n", #characterSets))
 	for _, data in pairs(characterSets) do
 		local category = CategoryToString[data.category]
 		if (not category) then
 			error("Unknown category " .. data.category .. " for character " .. data.codepoint)
 		end
 		local direction = DirectionToString[data.direction]
 		if (not direction) then
 			error("Unknown direction " .. data.direction .. " for character " .. data.codepoint)
 		end
 		file:write(string.format("\t{%d, %d, {%d, Unicode::%s, Unicode::%s}},\n", data.startCodepoint, data.endCodepoint, data.startCodepoint, category, direction))
 	end
 	file:write("};\n\n")
 	file:write(string.format("UnicodeCharacterSimpleMapping unicodeLower[%d] = {\n", #lowercaseCharacters))
 	for _, data in pairs(lowercaseCharacters) do
 		file:write(string.format("\t{%d, %d},\n", data.codepoint, data.lower))
 	end
 	file:write("};\n\n")
 	file:write(string.format("UnicodeCharacterSimpleMapping unicodeTitle[%d] = {\n", #titlecaseCharacters))
 	for _, data in pairs(titlecaseCharacters) do
 		file:write(string.format("\t{%d, %d},\n", data.codepoint, data.title))
 	end
 	file:write("};\n\n")
 	file:write(string.format("UnicodeCharacterSimpleMapping unicodeUpper[%d] = {\n", #uppercaseCharacters))
 	for _, data in pairs(uppercaseCharacters) do
 		file:write(string.format("\t{%d, %d},\n", data.codepoint, data.upper))
 	end
 	file:write("};\n\n")
 	file:close()
-	print("Took " .. os.difftime(os.clock(), t1) .. "sec.")
+	print("Succeeded in " .. (os.clock() - t1) .. "sec.")
 end
 --print(string.match("<Plane 15 Private Use, First>", "<.+, (%w+)>"))
--- a/include/Nazara/Core/Config.hpp
+++ b/include/Nazara/Core/Config.hpp
@@ -50,7 +50,7 @@
 #define NAZARA_CORE_FILE_BUFFERSIZE 4096
 // Incorporate the Unicode Character Data table (Necessary to make it work with the flag String::HandleUTF8)
-#define NAZARA_CORE_INCLUDE_UNICODEDATA 0
+#define NAZARA_CORE_INCLUDE_UNICODEDATA 1
 // Use the MemoryManager to manage dynamic allocations (can detect memory leak but allocations/frees are slower)
 #define NAZARA_CORE_MANAGE_MEMORY 0
--- a/include/Nazara/Core/Unicode.hpp
+++ b/include/Nazara/Core/Unicode.hpp
@@ -55,7 +55,7 @@ namespace Nz
 				Category_Other_PrivateUse		  = Category_Other | 0x0800,	   // Co
 				Category_Other_Surrogate		  = Category_Other | 0x1000,	   // Cs
-				// Ponctuations
+				// Punctuations
 				Category_Punctuation			  = 0x10,						   // P
 				Category_Punctuation_Close		  = Category_Punctuation | 0x0100, // Pe
 				Category_Punctuation_Connector	  = Category_Punctuation | 0x0200, // Pc
@@ -81,25 +81,29 @@ namespace Nz
 			enum Direction : UInt8
 			{
-				Direction_Arabic_Letter,		   // AL
+				Direction_Arabic_Letter,              // AL
-				Direction_Arabic_Number,		   // AN
+				Direction_Arabic_Number,              // AN
-				Direction_Boundary_Neutral,		   // BN
+				Direction_Boundary_Neutral,           // BN
-				Direction_Common_Separator,		   // CS
+				Direction_Common_Separator,           // CS
-				Direction_European_Number,		   // EN
+				Direction_European_Number,            // EN
-				Direction_European_Separator,	   // ES
+				Direction_European_Separator,         // ES
-				Direction_European_Terminator,	   // ET
+				Direction_European_Terminator,        // ET
-				Direction_Left_To_Right,		   // L
+				Direction_First_Strong_Isolate,       // FSI
-				Direction_Left_To_Right_Embedding, // LRE
+				Direction_Left_To_Right,              // L
-				Direction_Left_To_Right_Override,  // LRO
+				Direction_Left_To_Right_Embedding,    // LRE
-				Direction_Nonspacing_Mark,		   // NSM
+				Direction_Left_To_Right_Isolate,      // LRI
-				Direction_Other_Neutral,		   // ON
+				Direction_Left_To_Right_Override,     // LRO
-				Direction_Paragraph_Separator,	   // B
+				Direction_Nonspacing_Mark,            // NSM
-				Direction_Pop_Directional_Format,  // PDF
+				Direction_Other_Neutral,              // ON
-				Direction_Right_To_Left,		   // R
+				Direction_Paragraph_Separator,        // B
-				Direction_Right_To_Left_Embedding, // RLE
+				Direction_Pop_Directional_Formatting, // PDF
-				Direction_Right_To_Left_Override,  // RLO
+				Direction_Pop_Directional_Isolate,    // PDI
-				Direction_Segment_Separator,	   // S
+				Direction_Right_To_Left,              // R
-				Direction_White_Space			   // WS
+				Direction_Right_To_Left_Embedding,    // RLE
 				Direction_Right_To_Left_Isolate,      // RLI
 				Direction_Right_To_Left_Override,     // RLO
 				Direction_Segment_Separator,          // S
 				Direction_White_Space                 // WS
 			};
 			static Category GetCategory(char32_t character);
--- a/src/Nazara/Core/Unicode.cpp
+++ b/src/Nazara/Core/Unicode.cpp
@@ -4,23 +4,140 @@
 #include <Nazara/Core/Unicode.hpp>
 #include <Nazara/Core/Config.hpp>
 #include <algorithm>
 #include <Nazara/Core/Debug.hpp>
 #if NAZARA_CORE_INCLUDE_UNICODEDATA
 namespace Nz
 {
-	struct Character
+	struct UnicodeCharacter
 	{
-		UInt16 category;	// The type of the character
+		UInt32 codepoint;
-		UInt8	 direction;	// The reading way of the character
+		Unicode::Category category;  // The type of the character
-		UInt32 lowerCase;	// The corresponding lower character
+		Unicode::Direction direction; // The reading way of the character
-		UInt32 titleCase;	// The corresponding title character
+	};
-		UInt32 upperCase;	// The corresponding upper character
+
 	// Describes a contiguous range of code points sharing a single character entry.
 	// The lookup code treats both bounds as inclusive.
 	struct UnicodeSet
 	{
 		UInt32 firstCodepoint; // First code point of the range (inclusive)
 		UInt32 lastCodepoint; // Last code point of the range (inclusive)
 		UnicodeCharacter character; // Entry returned for any code point falling inside the range
 	};
 	// One entry of a simple case mapping table (lower, title or upper case)
 	struct UnicodeCharacterSimpleMapping
 	{
 		UInt32 codepoint; // Code point used as the search key
 		UInt32 character; // Code point the key maps to
 	};
 }
 #include <Nazara/Core/UnicodeData.hpp>
 	namespace
 	{
 		/*!
 		* \brief Finds the data entry describing a code point
 		* \return Pointer to the matching entry, or nullptr if the code point is neither listed individually nor covered by a set
 		*
 		* \param codepoint Code point to look for
 		*
 		* \remark Both tables are binary-searched, so they are assumed to be sorted by ascending code point
 		* \remark For a code point found through a set, the returned entry is the one shared by the whole range
 		*/
 		const UnicodeCharacter* GetCharacter(Nz::UInt32 codepoint)
 		{
 			// First try the table of individually listed characters
 			auto it = std::lower_bound(std::begin(unicodeCharacters), std::end(unicodeCharacters), codepoint, [](const UnicodeCharacter& character, Nz::UInt32 codepoint) { return character.codepoint < codepoint; });
 			if (it != std::end(unicodeCharacters) && it->codepoint == codepoint)
 				return &*it;
 			// Character is not part of the common character array, search in set.
 			// upper_bound gives the first set starting strictly after the code point, so the only
 			// candidate is the set right before it. lower_bound would skip a set whose first
 			// code point is exactly the one searched for.
 			auto itSet = std::upper_bound(std::begin(unicodeSets), std::end(unicodeSets), codepoint, [](Nz::UInt32 codepoint, const UnicodeSet& set) { return codepoint < set.firstCodepoint; });
 			if (itSet != std::begin(unicodeSets))
 			{
 				--itSet;
 				if (codepoint >= itSet->firstCodepoint && codepoint <= itSet->lastCodepoint)
 					return &itSet->character;
 			}
 			return nullptr;
 		}
 		/*!
 		* \brief Binary-searches a simple case mapping table for a code point
 		* \return Pointer to the entry whose key equals the code point, or nullptr if there is none
 		*
 		* \param codepoint Code point to look for
 		* \param mapping Table to search, assumed to be sorted by ascending key
 		*/
 		template<std::size_t N>
 		const UnicodeCharacterSimpleMapping* GetCharacterMapping(Nz::UInt32 codepoint, const UnicodeCharacterSimpleMapping(&mapping)[N])
 		{
 			const auto tableBegin = std::begin(mapping);
 			const auto tableEnd = std::end(mapping);
 			const auto entry = std::lower_bound(tableBegin, tableEnd, codepoint, [](const UnicodeCharacterSimpleMapping& lhs, Nz::UInt32 rhs) { return lhs.codepoint < rhs; });
 			// lower_bound only guarantees "not less than": reject anything that is not an exact hit
 			if (entry == tableEnd || entry->codepoint != codepoint)
 				return nullptr;
 			return &*entry;
 		}
 	}
 	/*!
 	* \brief Looks up the category of a character
 	* \return Category found in the Unicode data, or Category_NoCategory when the character has no entry
 	*
 	* \param character Code point whose category is requested
 	*/
 	Unicode::Category Unicode::GetCategory(char32_t character)
 	{
 		const UnicodeCharacter* entry = GetCharacter(character);
 		if (!entry)
 			return Category_NoCategory;
 		return entry->category;
 	}
 	/*!
 	* \brief Looks up the reading direction of a character
 	* \return Direction found in the Unicode data, or Direction_Boundary_Neutral when the character has no entry
 	*
 	* \param character Code point whose direction is requested
 	*/
 	Unicode::Direction Unicode::GetDirection(char32_t character)
 	{
 		const UnicodeCharacter* entry = GetCharacter(character);
 		if (!entry)
 			return Direction_Boundary_Neutral;
 		return entry->direction;
 	}
 	/*!
 	* \brief Gets the lower case of the character
 	* \return Unicode lower
 	*
 	* \param character Character to get assignated lower case
 	*/
 	char32_t Unicode::GetLowercase(char32_t character)
 	{
 		if (const UnicodeCharacterSimpleMapping* characterMapping = GetCharacterMapping(character, unicodeLower))
 			return characterMapping->character;
 		else
 			return character;
 	}
 	/*!
 	* \brief Gets the title case of the character
 	* \return Unicode title
 	*
 	* \param character Character to get assignated title case
 	*/
 	char32_t Unicode::GetTitlecase(char32_t character)
 	{
 		if (const UnicodeCharacterSimpleMapping* characterMapping = GetCharacterMapping(character, unicodeTitle))
 			return characterMapping->character;
 		else
 			return character;
 	}
 	/*!
 	* \brief Gets the upper case of the character
 	* \return Unicode upper
 	*
 	* \param character Character to get assignated upper case
 	*/
 	char32_t Unicode::GetUppercase(char32_t character)
 	{
 		if (const UnicodeCharacterSimpleMapping* characterMapping = GetCharacterMapping(character, unicodeUpper))
 			return characterMapping->character;
 		else
 			return character;
 	}
 }
 #else // Implementation handling ASCII table
 namespace Nz
--- a/src/Nazara/Core/UnicodeData.hpp
+++ b/src/Nazara/Core/UnicodeData.hpp