Core: Integrated Unicode data

2018-08-03 20:24:56 +02:00
parent cf24b8abe4
commit 39d1d31639
5 changed files with 36780 additions and 109 deletions
--- a/src/Nazara/Core/Unicode.cpp
+++ b/src/Nazara/Core/Unicode.cpp
@@ -4,23 +4,140 @@

 #include <Nazara/Core/Unicode.hpp>
 #include <Nazara/Core/Config.hpp>
+#include <algorithm>
 #include <Nazara/Core/Debug.hpp>

 #if NAZARA_CORE_INCLUDE_UNICODEDATA
 namespace Nz
 {
-	struct Character
+	struct UnicodeCharacter // Data record describing one codepoint
 	{
-		UInt16 category;	// The type of the character
-		UInt8	 direction;	// The reading way of the character
-		UInt32 lowerCase;	// The corresponding lower character
-		UInt32 titleCase;	// The corresponding title character
-		UInt32 upperCase;	// The corresponding upper character
+		UInt32 codepoint; // Codepoint of the character (the key compared by the binary search in GetCharacter)
+		Unicode::Category category;  // The type of the character
+		Unicode::Direction direction; // The reading way of the character
+	};
+
+	struct UnicodeSet // Range of codepoints sharing a single character record
+	{
+		UInt32 firstCodepoint; // First codepoint of the range (inclusive, per the bounds check in GetCharacter)
+		UInt32 lastCodepoint; // Last codepoint of the range (inclusive, per the bounds check in GetCharacter)
+		UnicodeCharacter character; // Record returned by GetCharacter for any codepoint inside the range
+	};
+
+	struct UnicodeCharacterSimpleMapping // One entry of a codepoint-to-codepoint mapping table
+	{
+		UInt32 codepoint; // Source codepoint (the key compared by the binary search in GetCharacterMapping)
+		UInt32 character; // Mapped codepoint, returned by GetLowercase/GetTitlecase/GetUppercase
 	};
-}

 #include <Nazara/Core/UnicodeData.hpp>

+	namespace
+	{
+		// Looks up the data record associated with a codepoint.
+		// The individually listed characters (unicodeCharacters) are searched first, then the codepoint ranges (unicodeSets).
+		// Both arrays are binary-searched, so they are presumably generated sorted by codepoint / firstCodepoint,
+		// with non-overlapping ranges (they come from UnicodeData.hpp, not visible here - to be confirmed).
+		// Returns a pointer to the matching record, or nullptr if neither array covers the codepoint.
+		const UnicodeCharacter* GetCharacter(Nz::UInt32 codepoint)
+		{
+			auto it = std::lower_bound(std::begin(unicodeCharacters), std::end(unicodeCharacters), codepoint, [](const UnicodeCharacter& character, Nz::UInt32 value) { return character.codepoint < value; });
+			if (it != std::end(unicodeCharacters) && it->codepoint == codepoint)
+				return &*it;
+
+			// Character is not part of the common character array, search in set.
+			// upper_bound gives the first set starting strictly after the codepoint, so the set right before it is the
+			// only candidate. Using lower_bound here would skip a set whose firstCodepoint is exactly the codepoint.
+			auto itSet = std::upper_bound(std::begin(unicodeSets), std::end(unicodeSets), codepoint, [](Nz::UInt32 value, const UnicodeSet& set) { return value < set.firstCodepoint; });
+			if (itSet != std::begin(unicodeSets))
+			{
+				--itSet;
+
+				// firstCodepoint <= codepoint is guaranteed by upper_bound, only the upper limit remains to be checked
+				if (codepoint <= itSet->lastCodepoint)
+					return &itSet->character;
+			}
+
+			return nullptr;
+		}
+
+		// Binary-searches a mapping table for an entry whose key is exactly the given codepoint.
+		// Returns a pointer to that entry, or nullptr when the table holds no entry for the codepoint.
+		template<std::size_t N>
+		const UnicodeCharacterSimpleMapping* GetCharacterMapping(Nz::UInt32 codepoint, const UnicodeCharacterSimpleMapping(&mapping)[N])
+		{
+			const auto precedes = [](const UnicodeCharacterSimpleMapping& entry, Nz::UInt32 value) { return entry.codepoint < value; };
+
+			const UnicodeCharacterSimpleMapping* const tableEnd = std::end(mapping);
+			const UnicodeCharacterSimpleMapping* const candidate = std::lower_bound(std::begin(mapping), tableEnd, codepoint, precedes);
+
+			// lower_bound only finds the insertion point, the key still has to be compared
+			if (candidate == tableEnd || candidate->codepoint != codepoint)
+				return nullptr;
+
+			return candidate;
+		}
+	}
+
+	/*!
+	* \brief Retrieves the category of a character
+	* \return Category stored for the character, or Category_NoCategory if no data exists for it
+	*
+	* \param character Character whose category is requested
+	*/
+	Unicode::Category Unicode::GetCategory(char32_t character)
+	{
+		const UnicodeCharacter* entry = GetCharacter(character);
+		if (!entry)
+			return Category_NoCategory;
+
+		return entry->category;
+	}
+
+	/*!
+	* \brief Retrieves the reading direction of a character
+	* \return Direction stored for the character, or Direction_Boundary_Neutral if no data exists for it
+	*
+	* \param character Character whose direction is requested
+	*/
+	Unicode::Direction Unicode::GetDirection(char32_t character)
+	{
+		const UnicodeCharacter* entry = GetCharacter(character);
+		if (!entry)
+			return Direction_Boundary_Neutral;
+
+		return entry->direction;
+	}
+
+	/*!
+	* \brief Retrieves the lower case version of a character
+	* \return Mapped lower case character, or the character itself if it has no lower case mapping
+	*
+	* \param character Character to convert to lower case
+	*/
+	char32_t Unicode::GetLowercase(char32_t character)
+	{
+		const UnicodeCharacterSimpleMapping* entry = GetCharacterMapping(character, unicodeLower);
+		if (!entry)
+			return character;
+
+		return entry->character;
+	}
+
+	/*!
+	* \brief Retrieves the title case version of a character
+	* \return Mapped title case character, or the character itself if it has no title case mapping
+	*
+	* \param character Character to convert to title case
+	*/
+	char32_t Unicode::GetTitlecase(char32_t character)
+	{
+		const UnicodeCharacterSimpleMapping* entry = GetCharacterMapping(character, unicodeTitle);
+		if (!entry)
+			return character;
+
+		return entry->character;
+	}
+
+	/*!
+	* \brief Retrieves the upper case version of a character
+	* \return Mapped upper case character, or the character itself if it has no upper case mapping
+	*
+	* \param character Character to convert to upper case
+	*/
+	char32_t Unicode::GetUppercase(char32_t character)
+	{
+		const UnicodeCharacterSimpleMapping* entry = GetCharacterMapping(character, unicodeUpper);
+		if (!entry)
+			return character;
+
+		return entry->character;
+	}
+}
+
 #else // Implementation handling ASCII table

 namespace Nz
--- a/src/Nazara/Core/UnicodeData.hpp
+++ b/src/Nazara/Core/UnicodeData.hpp