NazaraEngine/build/scripts/actions/unicode.lua

local CategoryToString = {}
	CategoryToString["C"]  = "Category_Other"
	CategoryToString["Cc"] = "Category_Other_Control"
	CategoryToString["Cf"] = "Category_Other_Format"
	CategoryToString["Cn"] = "Category_Other_NotAssigned"
	CategoryToString["Co"] = "Category_Other_PrivateUse"
	CategoryToString["Cs"] = "Category_Other_Surrogate"
	CategoryToString["L"]  = "Category_Letter"
	CategoryToString["Ll"] = "Category_Letter_Lowercase"
	CategoryToString["Lm"] = "Category_Letter_Modifier"
	CategoryToString["Lo"] = "Category_Letter_Other"
	CategoryToString["Lt"] = "Category_Letter_Titlecase"
	CategoryToString["Lu"] = "Category_Letter_Uppercase"
	CategoryToString["M"]  = "Category_Mark"
	CategoryToString["Me"] = "Category_Mark_Enclosing"
	CategoryToString["Mn"] = "Category_Mark_NonSpacing"
	CategoryToString["Mc"] = "Category_Mark_SpacingCombining"
	CategoryToString["N"]  = "Category_Number"
	CategoryToString["Nd"] = "Category_Number_DecimalDigit"
	CategoryToString["Nl"] = "Category_Number_Letter"
	CategoryToString["No"] = "Category_Number_Other"
	CategoryToString["P"]  = "Category_Punctuation"
	CategoryToString["Pe"] = "Category_Punctuation_Close"
	CategoryToString["Pc"] = "Category_Punctuation_Connector"
	CategoryToString["Pd"] = "Category_Punctuation_Dash"
	CategoryToString["Pf"] = "Category_Punctuation_FinalQuote"
	CategoryToString["Pi"] = "Category_Punctuation_InitialQuote"
	CategoryToString["Ps"] = "Category_Punctuation_Open"
	CategoryToString["Po"] = "Category_Punctuation_Other"
	CategoryToString["S"]  = "Category_Symbol"
	CategoryToString["Sc"] = "Category_Symbol_Currency"
	CategoryToString["Sm"] = "Category_Symbol_Math"
	CategoryToString["Sk"] = "Category_Symbol_Modifier"
	CategoryToString["So"] = "Category_Symbol_Other"
	CategoryToString["Z"]  = "Category_Separator"
	CategoryToString["Zl"] = "Category_Separator_Line"
	CategoryToString["Zp"] = "Category_Separator_Paragraph"
	CategoryToString["Zs"] = "Category_Separator_Space"

local DirectionToString = {}
	DirectionToString["AL"]  = "Direction_Arabic_Letter"
	DirectionToString["AN"]  = "Direction_Arabic_Number"
	DirectionToString["BN"]  = "Direction_Boundary_Neutral"
	DirectionToString["CS"]  = "Direction_Common_Separator"
	DirectionToString["EN"]  = "Direction_European_Number"
	DirectionToString["ES"]  = "Direction_European_Separator"
	DirectionToString["ET"]  = "Direction_European_Terminator"
	DirectionToString["L"]	 = "Direction_Left_To_Right"
	DirectionToString["LRE"] = "Direction_Left_To_Right_Embedding"
	DirectionToString["LRO"] = "Direction_Left_To_Right_Override"
	DirectionToString["NSM"] = "Direction_Nonspacing_Mark"
	DirectionToString["ON"]	 = "Direction_Other_Neutral"
	DirectionToString["B"]	 = "Direction_Paragraph_Separator"
	DirectionToString["PDF"] = "Direction_Pop_Directional_Format"
	DirectionToString["R"]	 = "Direction_Right_To_Left"
	DirectionToString["RLE"] = "Direction_Right_To_Left_Embedding"
	DirectionToString["RLO"] = "Direction_Right_To_Left_Override"
	DirectionToString["S"]	 = "Direction_Segment_Separator"
	DirectionToString["WS"]  = "Direction_White_Space"

table.maxn = table.maxn or function (tab) -- Compatibilit<69> Lua 5.2
	local maxIndex = 0
	for k,v in pairs(tab) do
		if (k > maxIndex) then
			maxIndex = k
		end
	end
end

function getCharacter(tab, first, index)
	local character = {}
	character.Category  = CategoryToString[tab[3]] or "Category_NoCategory"
	character.Direction = DirectionToString[tab[5]] or error("Direction not recognized")
	character.LowerCase = (string.len(tab[14]) ~= 0 and (tonumber(tab[14], 16)-first)) or index
	character.UpperCase = (string.len(tab[13]) ~= 0 and (tonumber(tab[13], 16)-first)) or index
	character.TitleCase = (string.len(tab[15]) ~= 0 and (tonumber(tab[15], 16)-first)) or character.UpperCase

	return character
end

function parseUnicodeData()
	local unicodeSet = {}

	file = io.open ("scripts/data/UnicodeData.txt", "r")
	if (not file) then
		error("Unable to open Unicode Data file")
		return
	end

	local t1 = os.clock()
	print("Parsing UnicodeData.txt...")
	local first = 0
	local last = 0
	unicodeSet[0] = {}
	unicodeSet[0].First = 0
	unicodeSet[0].Characters = {}
	local currentSet = 0
	local inblock = false
	local blockData = nil
	local unusedIndex = 0
	local c = 0
	for line in file:lines() do
		local old = 0
		local start = string.find(line, ';', old)
		local tab = {}
		while (start) do
			tab[#tab+1] = string.sub(line, old, start-1, old)
			old = start+1
			start = string.find(line, ';', old)
		end
		tab[#tab+1] = string.sub(line, old)

		local index = tonumber(tab[1], 16)
		if (index > 0 and not inblock) then
			if (index-last > 1000) then
				unicodeSet[currentSet].Last = last
				currentSet = currentSet + 1
				unicodeSet[currentSet] = {}
				unicodeSet[currentSet].First = index
				unicodeSet[currentSet].Characters = {}
				print("Set detected (Begin at " .. first .. ", end at " .. last .. ")")
				first = index
			else
				unusedIndex = unusedIndex + index-last-1
			end
		end

		local blockName, blockId = string.match(tab[2], "<(.+), (%w+)>")
		if (blockName ~= nil and blockId ~= nil) then
			if (blockId == "First") then
				if (inblock) then
					error("Already in block (" .. tab[1] .. ")")
				end
				inblock = true
				blockCharacter = getCharacter(tab, first)
			elseif (blockId == "Last") then
				if (not inblock) then
					error("Not in block (" .. tab[1] .. ")")
				end
				inblock = false
				for i=first, index do
					unicodeSet[currentSet].Characters[i] = getCharacter(tab, first, i)
				end
			end
		end

		unicodeSet[currentSet].Characters[index - first] = getCharacter(tab, first, index)
		if (unicodeSet[currentSet].Characters[index - first].LowerCase ~= (index - first) or
			unicodeSet[currentSet].Characters[index - first].UpperCase ~= (index - first) or
			unicodeSet[currentSet].Characters[index - first].TitleCase ~= (index - first)) then
			c = c + 1
		end

		last = index
	end
	unicodeSet[currentSet].Last = last
	print("Set detected (Begin at " .. first .. ", end at " .. last .. ")")
	file:close()

	print("Parsed " .. last+1 .. " characters in " .. #unicodeSet .. " sets, " .. unusedIndex .. " unused indices (took " .. os.difftime(os.clock(), t1) .. " sec)")

	file = io.open("../src/Nazara/Core/UnicodeData.hpp", "w+")
	if (not file) then
		error("Unable to create Unicode Data header")
		return
	end

	print("Writting Unicode Data to header...")

	t1 = os.clock()
	for i=0, #unicodeSet do
		local maxn = table.maxn(unicodeSet[i].Characters)
		file:write(string.format("Character unicodeSet%d[%d] = {\n", i, maxn+1))

		for j=0, maxn do
			local v = unicodeSet[i].Characters[j]
			if (v) then
				file:write(string.format("\t{%s,%s,%d,%d,%d},\n", v.Category, v.Direction, v.LowerCase, v.TitleCase, v.UpperCase))
			else
				file:write(string.format("\t{Category_NoCategory,Direction_Boundary_Neutral,%d,%d,%d},\n", j, j, j))
			end
		end

		file:write("};\n\n")
	end
	file:close()

	print("Took " .. os.difftime(os.clock(), t1) .. "sec.")
end
--print(string.match("<Plane 15 Private Use, First>", "<.+, (%w+)>"))

newaction
{
	trigger     = "unicode",
	description = "Parse the Unicode Character Data and put the useful informations into a header",
	execute     = parseUnicodeData
}