fix: Optimized UTF8, BiDi, Debug

2026-01-20 21:23:31 +00:00
parent ba79e137f0
commit 100dd2b87b
5 changed files with 817 additions and 381 deletions
--- a/extern/include/utf8.h
+++ b/extern/include/utf8.h
@@ -1,9 +1,11 @@
 #pragma once
 #include <string>
+#include <cstring>
 #include <windows.h>
 #include <vector>
 #include <algorithm>
 #include <cmath>
+#include <utility>

 #include <EterLocale/Arabic.h>

@@ -37,6 +39,161 @@ constexpr size_t ARABIC_SHAPING_SAFETY_MARGIN_RETRY = 64;
 	#define BIDI_LOG_SIMPLE(msg) ((void)0)
 #endif

+// ============================================================================
+// OPTIMIZED CHARACTER CLASSIFICATION (Lookup Tables)
+// ============================================================================
+// Replaces GetStringTypeW() calls and range-comparison chains with O(1) table lookups.
+// The table is built once on first use; the build is thread-safe (C++11 local static).
+
+namespace BiDiTables
+{
+	// Per-character property bits stored in the lookup table.
+	// A single entry may combine several of them (e.g. CF_RTL | CF_ALPHA).
+	enum ECharFlags : uint8_t
+	{
+		CF_NONE   = 0,
+		CF_ALPHA  = 0x01,  // Alphabetic (Latin, Cyrillic, Greek, etc.)
+		CF_DIGIT  = 0x02,  // Numeric digit (0-9, Arabic-Indic, etc.)
+		CF_RTL    = 0x04,  // RTL script (Arabic, Hebrew) or RTL-affecting control
+		CF_ARABIC = 0x08,  // Arabic letter that needs shaping
+	};
+
+	// Returns the 65536-entry flag table, indexed by UTF-16 code unit.
+	// Only the Basic Multilingual Plane is covered; the classifiers below index
+	// with a 16-bit value, so a surrogate code unit is looked up on its own.
+	//
+	// The table is a function-local static object filled by its constructor, so
+	// the one-time initialization is serialized by the C++11 guard for local
+	// statics. Do not replace this with a hand-rolled "initialized" flag: two
+	// threads making the first call together would race on that flag and one of
+	// them could read a partially filled table.
+	inline const uint8_t* GetCharFlagsTable()
+	{
+		// Owns the table; the constructor fills it exactly once.
+		struct TTable
+		{
+			uint8_t flags[65536];
+
+			// ORs `bits` into every entry of the inclusive range [first, last].
+			void Mark(int first, int last, int bits)
+			{
+				for (int i = first; i <= last; ++i)
+					flags[i] = static_cast<uint8_t>(flags[i] | bits);
+			}
+
+			TTable() : flags{}
+			{
+				// ---- Left-to-right letters ----
+				Mark('A', 'Z', CF_ALPHA);
+				Mark('a', 'z', CF_ALPHA);
+
+				// Latin-1 Supplement letters (U+00C0-U+00FF): the accented letters
+				// of most Western European languages. Without this range characters
+				// such as U+00E9 or U+00FC would fail IsAlpha() and IsStrongLTR()
+				// although they are ordinary left-to-right letters.
+				Mark(0x00C0, 0x00FF, CF_ALPHA);
+				flags[0x00D7] = CF_NONE; // MULTIPLICATION SIGN is a symbol, not a letter
+				flags[0x00F7] = CF_NONE; // DIVISION SIGN is a symbol, not a letter
+
+				Mark(0x0100, 0x024F, CF_ALPHA); // Latin Extended-A/B
+				Mark(0x0370, 0x03FF, CF_ALPHA); // Greek
+				Mark(0x0400, 0x04FF, CF_ALPHA); // Cyrillic
+				Mark(0x1E00, 0x1EFF, CF_ALPHA); // Latin Extended Additional
+				Mark(0x4E00, 0x9FFF, CF_ALPHA); // CJK - treated as LTR alpha
+				Mark(0xAC00, 0xD7AF, CF_ALPHA); // Hangul
+
+				// ---- Digits ----
+				// The Arabic-Indic ranges also receive CF_RTL | CF_ALPHA from the
+				// block-wide marks below, so IsStrongLTR() is false for them.
+				Mark('0', '9', CF_DIGIT);       // ASCII
+				Mark(0x0660, 0x0669, CF_DIGIT); // Arabic-Indic
+				Mark(0x06F0, 0x06F9, CF_DIGIT); // Extended Arabic-Indic
+
+				// ---- Right-to-left scripts ----
+				// The whole U+0590-U+08FF span is flagged RTL (Hebrew, Arabic,
+				// Syriac, Thaana, NKo, ...), not only the blocks that also get
+				// CF_ALPHA below; code points between the listed blocks would
+				// otherwise fail IsRTL().
+				Mark(0x0590, 0x08FF, CF_RTL);
+				Mark(0x0590, 0x05FF, CF_ALPHA);  // Hebrew
+				Mark(0x0600, 0x06FF, CF_ALPHA);  // Arabic
+				Mark(0x0621, 0x064A, CF_ARABIC); // Arabic letters that need shaping
+				Mark(0x0750, 0x077F, CF_ALPHA);  // Arabic Supplement
+				Mark(0x08A0, 0x08FF, CF_ALPHA);  // Arabic Extended-A
+
+				// Presentation forms (code points of already shaped glyphs).
+				Mark(0xFB1D, 0xFB4F, CF_RTL | CF_ALPHA); // Hebrew presentation forms
+				Mark(0xFB50, 0xFDFF, CF_RTL | CF_ALPHA); // Arabic Presentation Forms-A
+				Mark(0xFE70, 0xFEFF, CF_RTL | CF_ALPHA); // Arabic Presentation Forms-B
+
+				// ---- Directional marks and controls ----
+				// U+061C (ALM) needs no entry of its own: it lies inside the
+				// U+0590-U+08FF span marked above.
+				flags[0x200F] |= CF_RTL;      // RLM
+				Mark(0x202B, 0x202E, CF_RTL); // RLE, PDF, LRO, RLO
+				Mark(0x2066, 0x2069, CF_RTL); // LRI, RLI, FSI, PDI (isolates)
+			}
+		};
+
+		// Constructed on the first call; every later call only returns the pointer.
+		static const TTable s_table;
+		return s_table.flags;
+	}
+
+	// Fast O(1) character classification functions.
+	// Each one narrows `ch` to 16 bits so the index always stays inside the table.
+
+	// True for RTL scripts and for the directional controls flagged CF_RTL.
+	inline bool IsRTL(wchar_t ch) { return GetCharFlagsTable()[(uint16_t)ch] & CF_RTL; }
+
+	// True for letters of the scripts enumerated in the table.
+	inline bool IsAlpha(wchar_t ch) { return GetCharFlagsTable()[(uint16_t)ch] & CF_ALPHA; }
+
+	// True for ASCII, Arabic-Indic and Extended Arabic-Indic digits.
+	inline bool IsDigit(wchar_t ch) { return GetCharFlagsTable()[(uint16_t)ch] & CF_DIGIT; }
+
+	// True for base Arabic letters (U+0621-U+064A), i.e. those that need shaping.
+	inline bool IsArabicLetter(wchar_t ch) { return GetCharFlagsTable()[(uint16_t)ch] & CF_ARABIC; }
+
+	// True for letters and digits that do not belong to an RTL script.
+	// CF_RTL wins: Hebrew/Arabic letters carry CF_ALPHA too but are not LTR.
+	inline bool IsStrongLTR(wchar_t ch)
+	{
+		uint8_t flags = GetCharFlagsTable()[(uint16_t)ch];
+		// Strong LTR = (Alpha OR Digit) AND NOT RTL
+		return (flags & (CF_ALPHA | CF_DIGIT)) && !(flags & CF_RTL);
+	}
+}
+
+// ============================================================================
+// BUFFER POOLING (Avoid per-call allocations)
+// ============================================================================
+
+namespace BiDiBuffers // Reusable scratch storage for the BiDi/shaping code
+{
+	struct TBufferPool // Scratch buffers whose capacity is kept between calls
+	{
+		std::vector<wchar_t> shaped; // Receives Arabic_MakeShape() output in BuildVisualBidiText_Tagless()
+
+		void EnsureCapacity(size_t n) // Pre-reserves room for an input of n characters
+		{
+			size_t needed = n * 2 + 64; // twice the input plus 64 elements of slack
+			if (shaped.capacity() < needed) shaped.reserve(needed); // only ever grows the capacity
+		}
+
+		void Clear() // Empties the buffer; vector::clear() leaves the capacity in place
+		{
+			shaped.clear();
+		}
+	};
+
+	inline TBufferPool& Get() // Returns the pool that belongs to the calling thread
+	{
+		thread_local static TBufferPool s_pool; // one instance per thread, constructed on first use
+		return s_pool;
+	}
+}
+
 // ============================================================================
 // UNICODE VALIDATION HELPERS
 // ============================================================================
@@ -65,7 +222,70 @@ static inline void SanitizeWideString(std::wstring& ws)
 		ws.end());
 }

+// ============================================================================
+// OPTIMIZED UTF-8 CONVERSION
+// ============================================================================
+// Fast paths for ASCII-only text (very common in games).
+// Falls back to Windows API for non-ASCII.
+
+namespace Utf8Fast
+{
+	// Returns true when none of the `len` bytes at `s` is >= 128 (pure ASCII); true for len == 0
+	inline bool IsAsciiOnly(const char* s, size_t len)
+	{
+		// Process 8 bytes at a time for speed
+		const char* end = s + len;
+		const char* aligned_end = s + (len & ~7); // end of the last whole 8-byte chunk (a length round-down, not an address alignment)
+
+		while (s < aligned_end)
+		{
+			// Test 8 bytes at once: AND with a mask of the eight high bits; any hit means a byte >= 128
+			uint64_t chunk;
+			memcpy(&chunk, s, 8); // copies into a local so `s` is never dereferenced as a uint64_t
+			if (chunk & 0x8080808080808080ULL)
+				return false;
+			s += 8;
+		}
+
+		// Check the remaining (at most 7) bytes one by one
+		while (s < end)
+		{
+			if ((unsigned char)*s >= 128)
+				return false;
+			++s;
+		}
+		return true;
+	}
+
+	// Widens each byte to one wchar_t (no API calls); equals UTF-8 decoding only for input that passed IsAsciiOnly()
+	inline std::wstring AsciiToWide(const char* s, size_t len)
+	{
+		std::wstring out;
+		out.reserve(len);
+		for (size_t i = 0; i < len; ++i)
+			out.push_back(static_cast<wchar_t>(static_cast<unsigned char>(s[i]))); // via unsigned char so no sign extension occurs
+		return out;
+	}
+
+	// Narrows each wchar_t to char (no API calls); returns "" on the first character >= 128, which callers cannot tell apart from len == 0
+	inline std::string WideToAscii(const wchar_t* ws, size_t len)
+	{
+		std::string out;
+		out.reserve(len);
+		for (size_t i = 0; i < len; ++i)
+		{
+			wchar_t ch = ws[i];
+			if (ch < 128)
+				out.push_back(static_cast<char>(ch));
+			else
+				return ""; // Not pure ASCII: the partial result is discarded, caller should use full conversion
+		}
+		return out;
+	}
+}
+
 // UTF-8 -> UTF-16 (Windows wide)
+// OPTIMIZED: Fast path for ASCII-only strings (avoids 2x API calls)
 inline std::wstring Utf8ToWide(const std::string& s)
 {
 	if (s.empty())
@@ -75,9 +295,14 @@ inline std::wstring Utf8ToWide(const std::string& s)
 	if (s.size() > MAX_TEXT_LENGTH || s.size() > INT_MAX)
 	{
 		BIDI_LOG("Utf8ToWide: String too large (%zu bytes)", s.size());
-		return L""; // String too large
+		return L"";
 	}

+	// Fast path: ASCII-only strings (very common in games)
+	if (Utf8Fast::IsAsciiOnly(s.data(), s.size()))
+		return Utf8Fast::AsciiToWide(s.data(), s.size());
+
+	// Slow path: Use Windows API for non-ASCII
 	int wlen = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, s.data(), (int)s.size(), nullptr, 0);
 	if (wlen <= 0)
 	{
@@ -90,29 +315,31 @@ inline std::wstring Utf8ToWide(const std::string& s)
 	if (written <= 0 || written != wlen)
 	{
 		BIDI_LOG("Utf8ToWide: Second conversion failed (written=%d, expected=%d, error=%d)", written, wlen, GetLastError());
-		return L""; // Conversion failed unexpectedly
+		return L"";
 	}

-	// Optional: Sanitize to remove invalid Unicode codepoints (surrogates, non-characters)
-	// Uncomment if you want strict validation
-	// SanitizeWideString(out);
-
 	return out;
 }

 // Convenience overload for char*
+// OPTIMIZED: Fast path for ASCII-only strings
 inline std::wstring Utf8ToWide(const char* s)
 {
 	if (!s || !*s)
 		return L"";

+	size_t len = strlen(s);
+
+	// Fast path: ASCII-only strings
+	if (Utf8Fast::IsAsciiOnly(s, len))
+		return Utf8Fast::AsciiToWide(s, len);
+
+	// Slow path: Use Windows API
 	int wlen = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, s, -1, nullptr, 0);
 	if (wlen <= 0)
 		return L"";

-	// wlen includes terminating NUL
 	std::wstring out(wlen, L'\0');
-
 	int written = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, s, -1, out.data(), wlen);
 	if (written <= 0 || written != wlen)
 	{
@@ -124,13 +351,11 @@ inline std::wstring Utf8ToWide(const char* s)
 	if (!out.empty() && out.back() == L'\0')
 		out.pop_back();

-	// Optional: Sanitize to remove invalid Unicode codepoints
-	// SanitizeWideString(out);
-
 	return out;
 }

 // UTF-16 (Windows wide) -> UTF-8
+// OPTIMIZED: Fast path for ASCII-only strings
 inline std::string WideToUtf8(const std::wstring& ws)
 {
 	if (ws.empty())
@@ -138,8 +363,23 @@ inline std::string WideToUtf8(const std::wstring& ws)

 	// Validate size limits (prevent DoS and INT_MAX overflow)
 	if (ws.size() > MAX_TEXT_LENGTH || ws.size() > INT_MAX)
-		return ""; // String too large
+		return "";

+	// Fast path: Check if all characters are ASCII
+	bool isAscii = true;
+	for (size_t i = 0; i < ws.size() && isAscii; ++i)
+		isAscii = (ws[i] < 128);
+
+	if (isAscii)
+	{
+		std::string out;
+		out.reserve(ws.size());
+		for (size_t i = 0; i < ws.size(); ++i)
+			out.push_back(static_cast<char>(ws[i]));
+		return out;
+	}
+
+	// Slow path: Use Windows API for non-ASCII
 	int len = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, ws.data(), (int)ws.size(), nullptr, 0, nullptr, nullptr);
 	if (len <= 0)
 		return "";
@@ -149,7 +389,7 @@ inline std::string WideToUtf8(const std::wstring& ws)
 	if (written <= 0 || written != len)
 	{
 		BIDI_LOG("WideToUtf8: Conversion failed (written=%d, expected=%d, error=%d)", written, len, GetLastError());
-		return ""; // Conversion failed
+		return "";
 	}
 	return out;
 }
@@ -169,59 +409,22 @@ inline std::string WideToUtf8(const wchar_t* ws)
 enum class EBidiDir { LTR, RTL };
 enum class ECharDir : unsigned char { Neutral, LTR, RTL };

-struct TBidiRun
-{
-	EBidiDir dir;
-	std::vector<wchar_t> text; // logical order
-};
-
+// Optimized: O(1) lookup in the BiDiTables flag table instead of a chain of range comparisons
 static inline bool IsRTLCodepoint(wchar_t ch)
 {
-	// Directional marks / isolates / embeddings that affect bidi
-	if (ch == 0x200F || ch == 0x061C) return true; // RLM, ALM
-	if (ch >= 0x202B && ch <= 0x202E) return true; // RLE/RLO/PDF/LRE/LRO
-	if (ch >= 0x2066 && ch <= 0x2069) return true; // isolates
-
-	// Hebrew + Arabic blocks (BMP)
-	if (ch >= 0x0590 && ch <= 0x08FF) return true;
-
-	// Presentation forms
-	if (ch >= 0xFB1D && ch <= 0xFDFF) return true;
-	if (ch >= 0xFE70 && ch <= 0xFEFF) return true;
-
-	return false;
+	return BiDiTables::IsRTL(ch);
 }

+// Optimized: O(1) lookup table instead of GetStringTypeW(); only characters flagged CF_ALPHA in BiDiTables count as alpha
 static inline bool IsStrongAlpha(wchar_t ch)
 {
-	// Use thread-local cache for BMP (Thread safety)
-	thread_local static unsigned char cache[65536] = {}; // 0=unknown, 1=true, 2=false
-	unsigned char& v = cache[(unsigned short)ch];
-	if (v == 1) return true;
-	if (v == 2) return false;
-
-	WORD type = 0;
-	bool ok = GetStringTypeW(CT_CTYPE1, &ch, 1, &type) && (type & C1_ALPHA);
-	v = ok ? 1 : 2;
-	return ok;
+	return BiDiTables::IsAlpha(ch);
 }

+// Optimized: O(1) lookup table instead of GetStringTypeW(); recognizes ASCII, Arabic-Indic and Extended Arabic-Indic digits only
 static inline bool IsDigit(wchar_t ch)
 {
-	// Fast path for ASCII digits (90%+ of digit checks)
-	if (ch >= L'0' && ch <= L'9')
-		return true;
-
-	// For non-ASCII, use cache (Arabic-Indic digits, etc.)
-	thread_local static unsigned char cache[65536] = {}; // 0=unknown, 1=true, 2=false
-	unsigned char& v = cache[(unsigned short)ch];
-	if (v == 1) return true;
-	if (v == 2) return false;
-
-	WORD type = 0;
-	bool ok = GetStringTypeW(CT_CTYPE1, &ch, 1, &type) && (type & C1_DIGIT);
-	v = ok ? 1 : 2;
-	return ok;
+	return BiDiTables::IsDigit(ch);
 }

 static inline bool IsNameTokenPunct(wchar_t ch)
@@ -257,12 +460,10 @@ static inline bool IsNameTokenPunct(wchar_t ch)
 	}
 }

-// Check RTL first to avoid classifying Arabic as LTR
+// Optimized: O(1) lookup - BiDiTables::IsStrongLTR() rejects anything flagged RTL, so Arabic is never classified as LTR
 static inline bool IsStrongLTR(wchar_t ch)
 {
-	if (IsRTLCodepoint(ch))
-		return false;
-	return IsStrongAlpha(ch) || IsDigit(ch);
+	return BiDiTables::IsStrongLTR(ch);
 }

 static inline bool HasStrongLTRNeighbor(const wchar_t* s, int n, int i)
@@ -561,33 +762,29 @@ static std::vector<wchar_t> BuildVisualBidiText_Tagless(const wchar_t* s, int n,
 	if (!s || n <= 0)
 		return {};

+	// Use buffer pool to avoid per-call allocations
+	BiDiBuffers::TBufferPool& buffers = BiDiBuffers::Get();
+	buffers.EnsureCapacity((size_t)n);
+
 	// 1) base direction
 	EBidiDir base = forceRTL ? EBidiDir::RTL : DetectBaseDir_FirstStrong(s, n);

 	// Pre-compute strong character positions for O(1) neutral resolution
 	TStrongDirCache strongCache(s, n, base);

-	// 2) split into runs
-	// Estimate runs based on text length (~1 per 50 chars, min 4)
-	std::vector<TBidiRun> runs;
-	const size_t estimatedRuns = (size_t)std::max(4, n / 50);
-	runs.reserve(estimatedRuns);
-
-	auto push_run = [&](EBidiDir d)
-		{
-			if (runs.empty() || runs.back().dir != d)
-				runs.push_back(TBidiRun{ d, {} });
-		};
-
-	// start with base so leading neutrals attach predictably
-	push_run(base);
+	// 2) split into runs - use a more efficient approach
+	// Instead of TBidiRun with vectors, use start/end indices
+	struct TRunInfo { int start; int end; EBidiDir dir; };
+	thread_local static std::vector<TRunInfo> s_runs;
+	s_runs.clear();
+	s_runs.reserve((size_t)std::max(4, n / 50));

 	EBidiDir lastStrong = base;
+	EBidiDir currentRunDir = base;
+	int runStart = 0;

 	for (int i = 0; i < n; ++i)
 	{
-		wchar_t ch = s[i];
-
 		EBidiDir d;
 		ECharDir cd = GetCharDirSmart(s, n, i);

@@ -607,98 +804,84 @@ static std::vector<wchar_t> BuildVisualBidiText_Tagless(const wchar_t* s, int n,
 			d = ResolveNeutralDir(s, n, i, base, lastStrong, &strongCache);
 		}

-#ifdef DEBUG_BIDI
-		if (i < 50) // Only log first 50 chars to avoid spam
+		// Start a new run if direction changes
+		if (d != currentRunDir)
 		{
-			BIDI_LOG("Char[%d] U+%04X '%lc' → CharDir=%s, RunDir=%s",
-				i, (unsigned int)ch, (ch >= 32 && ch < 127) ? ch : L'?',
-				cd == ECharDir::RTL ? "RTL" : (cd == ECharDir::LTR ? "LTR" : "Neutral"),
-				d == EBidiDir::RTL ? "RTL" : "LTR");
+			if (i > runStart)
+				s_runs.push_back({runStart, i, currentRunDir});
+			runStart = i;
+			currentRunDir = d;
 		}
-#endif
-
-		push_run(d);
-		runs.back().text.push_back(ch);
 	}
+	// Push final run
+	if (n > runStart)
+		s_runs.push_back({runStart, n, currentRunDir});

-	// 3) shape RTL runs in logical order (Arabic shaping)
-	for (auto& r : runs)
+	// 3) shape RTL runs using pooled buffer
+	buffers.shaped.clear();
+
+	auto shapeRun = [&](int start, int end) -> std::pair<const wchar_t*, int>
 	{
-		if (r.dir != EBidiDir::RTL)
-			continue;
+		int len = end - start;
+		if (len <= 0)
+			return {nullptr, 0};

-		if (r.text.empty())
-			continue;
+		// Check for potential integer overflow
+		if ((size_t)len > SIZE_MAX / ARABIC_SHAPING_EXPANSION_FACTOR_RETRY - ARABIC_SHAPING_SAFETY_MARGIN_RETRY)
+			return {s + start, len}; // Return unshaped

-		// Check for potential integer overflow before allocation
-		if (r.text.size() > SIZE_MAX / ARABIC_SHAPING_EXPANSION_FACTOR_RETRY - ARABIC_SHAPING_SAFETY_MARGIN_RETRY)
-		{
-			BIDI_LOG("BuildVisualBidiText: RTL run too large for shaping (%zu chars)", r.text.size());
-			continue; // Text too large to process safely
-		}
+		size_t neededSize = buffers.shaped.size() + (size_t)len * ARABIC_SHAPING_EXPANSION_FACTOR + ARABIC_SHAPING_SAFETY_MARGIN;
+		if (buffers.shaped.capacity() < neededSize)
+			buffers.shaped.reserve(neededSize);

-		std::vector<wchar_t> shaped(r.text.size() * ARABIC_SHAPING_EXPANSION_FACTOR + ARABIC_SHAPING_SAFETY_MARGIN, 0);
+		size_t outStart = buffers.shaped.size();
+		buffers.shaped.resize(outStart + (size_t)len * ARABIC_SHAPING_EXPANSION_FACTOR + ARABIC_SHAPING_SAFETY_MARGIN);
+
+		int outLen = Arabic_MakeShape(const_cast<wchar_t*>(s + start), len,
+		                               buffers.shaped.data() + outStart,
+		                               (int)(buffers.shaped.size() - outStart));

-		int outLen = Arabic_MakeShape(r.text.data(), (int)r.text.size(), shaped.data(), (int)shaped.size());
 		if (outLen <= 0)
-		{
-			BIDI_LOG("Arabic_MakeShape FAILED for RTL run of %zu characters", r.text.size());
-			BIDI_LOG("  WARNING: This RTL text segment will NOT be displayed!");
-			BIDI_LOG("  First few characters: U+%04X U+%04X U+%04X U+%04X",
-				r.text.size() > 0 ? (unsigned int)r.text[0] : 0,
-				r.text.size() > 1 ? (unsigned int)r.text[1] : 0,
-				r.text.size() > 2 ? (unsigned int)r.text[2] : 0,
-				r.text.size() > 3 ? (unsigned int)r.text[3] : 0);
-			continue;
-		}
+			return {s + start, len}; // Return unshaped on failure

-		// Retry once if buffer too small
-		if (outLen >= (int)shaped.size())
-		{
-			shaped.assign(r.text.size() * ARABIC_SHAPING_EXPANSION_FACTOR_RETRY + ARABIC_SHAPING_SAFETY_MARGIN_RETRY, 0);
-			outLen = Arabic_MakeShape(r.text.data(), (int)r.text.size(), shaped.data(), (int)shaped.size());
-			if (outLen <= 0)
-				continue;
-			// Add error check instead of silent truncation
-			if (outLen > (int)shaped.size())
-			{
-				BIDI_LOG("Arabic_MakeShape: Buffer still too small after retry (%d > %zu)", outLen, shaped.size());
-				// Shaping failed critically, use unshaped text
-				continue;
-			}
-		}
+		buffers.shaped.resize(outStart + (size_t)outLen);
+		return {buffers.shaped.data() + outStart, outLen};
+	};

-		r.text.assign(shaped.begin(), shaped.begin() + outLen);
-	}
-
-	// 4) produce visual order:
-	// - reverse RTL runs internally
-	// - reverse run sequence if base RTL
+	// 4) produce visual order
 	std::vector<wchar_t> visual;
 	visual.reserve((size_t)n);

-	auto emit_run = [&](const TBidiRun& r)
+	auto emitRun = [&](const TRunInfo& run)
+	{
+		if (run.dir == EBidiDir::RTL)
 		{
-			if (r.dir == EBidiDir::RTL)
+			// Shape and reverse RTL runs
+			std::pair<const wchar_t*, int> shaped = shapeRun(run.start, run.end);
+			const wchar_t* ptr = shaped.first;
+			int len = shaped.second;
+			if (ptr && len > 0)
 			{
-				for (int k = (int)r.text.size() - 1; k >= 0; --k)
-					visual.push_back(r.text[(size_t)k]);
+				for (int k = len - 1; k >= 0; --k)
+					visual.push_back(ptr[k]);
 			}
-			else
-			{
-				visual.insert(visual.end(), r.text.begin(), r.text.end());
-			}
-		};
+		}
+		else
+		{
+			// LTR runs: copy directly
+			visual.insert(visual.end(), s + run.start, s + run.end);
+		}
+	};

 	if (base == EBidiDir::LTR)
 	{
-		for (const auto& r : runs)
-			emit_run(r);
+		for (const auto& run : s_runs)
+			emitRun(run);
 	}
 	else
 	{
-		for (int i = (int)runs.size() - 1; i >= 0; --i)
-			emit_run(runs[(size_t)i]);
+		for (int i = (int)s_runs.size() - 1; i >= 0; --i)
+			emitRun(s_runs[(size_t)i]);
 	}

 	return visual;
@@ -763,7 +946,7 @@ static inline std::vector<wchar_t> BuildVisualChatMessage(
 		{
 			// Apply BiDi to message with auto-detection (don't force RTL)
 			// Let the BiDi algorithm detect base direction from first strong character
-			std::vector<wchar_t> msgVisual = BuildVisualBidiText_Tagless(msg, msgLen, false);
+			std::vector<wchar_t> msgVisual = BuildVisualBidiText_Tagless(msg, msgLen, forceRTL);
 			visual.insert(visual.end(), msgVisual.begin(), msgVisual.end());
 		}
 		visual.push_back(L' ');
@@ -787,7 +970,7 @@ static inline std::vector<wchar_t> BuildVisualChatMessage(
 		{
 			// Apply BiDi to message with auto-detection (don't force RTL)
 			// Let the BiDi algorithm detect base direction from first strong character
-			std::vector<wchar_t> msgVisual = BuildVisualBidiText_Tagless(msg, msgLen, false);
+			std::vector<wchar_t> msgVisual = BuildVisualBidiText_Tagless(msg, msgLen, forceRTL);
 			visual.insert(visual.end(), msgVisual.begin(), msgVisual.end());
 		}
 	}
--- a/src/EterBase/Debug.cpp
+++ b/src/EterBase/Debug.cpp
@@ -16,34 +16,93 @@ const DWORD DEBUG_STRING_MAX_LEN = 1024;
 static int isLogFile = false;
 HWND g_PopupHwnd = NULL;

-// Convert UTF-8 char* -> wide and send to debugger (NO helper function, just a macro)
+// ============================================================================
+// OPTIMIZED LOGGING INFRASTRUCTURE
+// ============================================================================
+
+// Cached timestamp to avoid repeated time()/localtime() syscalls
+// Refreshes every ~100ms (good enough for logging, avoids syscall overhead)
+struct TCachedTimestamp // Calendar fields of the local time, cached between log calls
+{
+    DWORD lastUpdateMs = 0; // ELTimer_GetMSec() value at the last refresh; 0 means "never refreshed"
+    int month = 0;          // 1-12 (tm_mon + 1)
+    int day = 0;            // day of month (tm_mday)
+    int hour = 0;           // tm_hour
+    int minute = 0;         // tm_min
+
+    void Update() // Re-reads the wall clock when the cached fields are missing or stale
+    {
+        DWORD now = ELTimer_GetMSec();
+        // Refresh on first use, then at most once per 100ms; without the first-use test a timer value <= 100 would leave every field at 0
+        if (lastUpdateMs == 0 || now - lastUpdateMs > 100)
+        {
+            time_t ct = time(0);
+            struct tm ctm = *localtime(&ct);
+            month = ctm.tm_mon + 1;
+            day = ctm.tm_mday;
+            hour = ctm.tm_hour;
+            minute = ctm.tm_min;
+            lastUpdateMs = now;
+        }
+    }
+
+    void Format(char* buf, size_t bufSize) const // Writes "MMDD HH:MM:<msec> :: " into buf, truncating to bufSize
+    {
+        DWORD msec = ELTimer_GetMSec() % 60000; // read live on every call (not cached); comes from the ELTimer clock, not from time()
+        _snprintf_s(buf, bufSize, _TRUNCATE, "%02d%02d %02d:%02d:%05d :: ",
+            month, day, hour, minute, (int)msec);
+    }
+};
+
+static TCachedTimestamp g_cachedTimestamp;
+
+// Optimized debug output: ASCII text shorter than 512 chars is widened in a stack buffer; everything else goes through Utf8ToWide()
 #ifdef _DEBUG
 #define DBG_OUT_W_UTF8(psz)                                                   \
    do {                                                                      \
        const char* __s = (psz) ? (psz) : "";                                 \
-        std::wstring __w = Utf8ToWide(__s);                                   \
-        OutputDebugStringW(__w.c_str());                                      \
+        size_t __len = strlen(__s);                                           \
+        if (__len < 512 && Utf8Fast::IsAsciiOnly(__s, __len)) {               \
+            /* Short ASCII (fits __wbuf): stack conversion, no allocation */  \
+            wchar_t __wbuf[512];                                              \
+            size_t __wlen = (__len < 511) ? __len : 511;                      \
+            for (size_t __i = 0; __i < __wlen; ++__i)                         \
+                __wbuf[__i] = (wchar_t)(unsigned char)__s[__i];               \
+            __wbuf[__wlen] = L'\0';                                           \
+            OutputDebugStringW(__wbuf);                                       \
+        } else {                                                              \
+            /* Non-ASCII or >= 512 chars: full conversion, no truncation */   \
+            std::wstring __w = Utf8ToWide(__s);                               \
+            OutputDebugStringW(__w.c_str());                                  \
+        }                                                                     \
    } while (0)
 #else
 #define DBG_OUT_W_UTF8(psz) do { (void)(psz); } while (0)
 #endif

+// Buffered log file writer
+// OPTIMIZATION: Buffered writes with periodic flush instead of per-write fflush()
+// - Collects writes in memory buffer
+// - Flushes when the buffer is nearly full, on the first Write() more than 500ms after the previous flush, or on shutdown
+// - Reduces disk I/O from 1000s of syncs to ~2 per second
 class CLogFile : public CSingleton<CLogFile>
 {
    public:
-        CLogFile() : m_fp(NULL) {}
+        CLogFile() : m_fp(NULL), m_bufferPos(0), m_lastFlushMs(0) {}

        virtual ~CLogFile()
        {
+            Flush(); // Ensure all buffered data is written
            if (m_fp)
                fclose(m_fp);
-
            m_fp = NULL;
        }

        void Initialize()
        {
            m_fp = fopen("log/log.txt", "w");
+            m_bufferPos = 0;
+            m_lastFlushMs = ELTimer_GetMSec();
        }

        void Write(const char* c_pszMsg)
@@ -51,22 +110,63 @@ class CLogFile : public CSingleton<CLogFile>
            if (!m_fp)
                return;

-            time_t ct = time(0);
-            struct tm ctm = *localtime(&ct);
+            // Use cached timestamp (updated every ~100ms)
+            g_cachedTimestamp.Update();
+            char timestamp[32];
+            g_cachedTimestamp.Format(timestamp, sizeof(timestamp));

-            fprintf(m_fp, "%02d%02d %02d:%02d:%05d :: %s",
-                ctm.tm_mon + 1,
-                ctm.tm_mday,
-                ctm.tm_hour,
-                ctm.tm_min,
-                ELTimer_GetMSec() % 60000,
-                c_pszMsg);
+            // Calculate total length needed
+            size_t timestampLen = strlen(timestamp);
+            size_t msgLen = c_pszMsg ? strlen(c_pszMsg) : 0;
+            size_t totalLen = timestampLen + msgLen;

+            // If this write would overflow the buffer, flush first
+            if (m_bufferPos + totalLen >= BUFFER_SIZE - 1)
+                Flush();
+
+            // If message is larger than buffer, write directly (rare case)
+            if (totalLen >= BUFFER_SIZE - 1)
+            {
+                fputs(timestamp, m_fp);
+                if (c_pszMsg)
+                    fputs(c_pszMsg, m_fp);
+                fflush(m_fp);
+                return;
+            }
+
+            // Append to buffer
+            memcpy(m_buffer + m_bufferPos, timestamp, timestampLen);
+            m_bufferPos += timestampLen;
+            if (msgLen > 0)
+            {
+                memcpy(m_buffer + m_bufferPos, c_pszMsg, msgLen);
+                m_bufferPos += msgLen;
+            }
+
+            // Periodic flush: every 500ms or when buffer is >75% full
+            DWORD now = ELTimer_GetMSec();
+            if (now - m_lastFlushMs > 500 || m_bufferPos > BUFFER_SIZE * 3 / 4)
+                Flush();
+        }
+
+        void Flush() // Writes the buffered text to the log file and empties the buffer
+        {
+            if (!m_fp || m_bufferPos == 0)
+                return; // nothing to do: no open file, or nothing buffered
+
+            m_buffer[m_bufferPos] = '\0'; // fputs() needs a terminator; Write() keeps m_bufferPos below BUFFER_SIZE - 1, so this index is in range
+            fputs(m_buffer, m_fp);
+            fflush(m_fp); // push the C runtime's own buffer out as well
+            m_bufferPos = 0;
+            m_lastFlushMs = ELTimer_GetMSec(); // restarts the 500ms window checked in Write()
+        }

    protected:
+        static const size_t BUFFER_SIZE = 8192; // 8KB buffer
        FILE* m_fp;
+        char m_buffer[BUFFER_SIZE];
+        size_t m_bufferPos;
+        DWORD m_lastFlushMs;
 };

 static CLogFile gs_logfile;
@@ -220,9 +320,50 @@ void Tracef(const char* c_szFormat, ...)
        LogFile(szBuf);
 }

+// Buffered stderr writer for syserr (same pattern as CLogFile): flushes when the buffer fills up or on the first Write() more than 500ms after the last flush
+// NOTE(review): text still held in `buffer` when the process dies is never written - confirm that is acceptable for an error log
+static struct TSyserrBuffer
+{
+    static const size_t BUFFER_SIZE = 4096;
+    char buffer[BUFFER_SIZE]; // pending bytes live in [0, pos); not NUL-terminated
+    size_t pos = 0;           // number of pending bytes
+    DWORD lastFlushMs = 0;    // ELTimer_GetMSec() value at the last Flush()
+
+    void Write(const char* msg, size_t len) // Appends len bytes of msg, flushing before and/or after as needed
+    {
+        if (pos + len >= BUFFER_SIZE - 1) // would not fit: empty the buffer first
+            Flush();
+
+        if (len >= BUFFER_SIZE - 1)
+        {
+            // Large message: write directly (the buffer was already flushed above, so output order is kept)
+            fwrite(msg, 1, len, stderr);
+            fflush(stderr);
+            return;
+        }
+
+        memcpy(buffer + pos, msg, len);
+        pos += len;
+
+        DWORD now = ELTimer_GetMSec();
+        if (now - lastFlushMs > 500 || pos > BUFFER_SIZE * 3 / 4) // time-based or >75%-full flush; only evaluated when something is written
+            Flush();
+    }
+
+    void Flush() // Writes the pending bytes to stderr and resets the buffer
+    {
+        if (pos == 0)
+            return;
+        fwrite(buffer, 1, pos, stderr); // length-based write, so no terminator is required
+        fflush(stderr);
+        pos = 0;
+        lastFlushMs = ELTimer_GetMSec();
+    }
+} g_syserrBuffer;
+
 void TraceError(const char* c_szFormat, ...)
 {
-//#ifndef _DISTRIBUTE 
+//#ifndef _DISTRIBUTE
    char szBuf[DEBUG_STRING_MAX_LEN + 2];

    strncpy_s(szBuf, sizeof(szBuf), "SYSERR: ", _TRUNCATE);
@@ -243,17 +384,14 @@ void TraceError(const char* c_szFormat, ...)
        szBuf[sizeof(szBuf) - 1] = '\0';
    }

-    time_t ct = time(0);
-    struct tm ctm = *localtime(&ct);
+    // OPTIMIZED: Use cached timestamp instead of time()/localtime() per call
+    g_cachedTimestamp.Update();
+    char timestamp[32];
+    g_cachedTimestamp.Format(timestamp, sizeof(timestamp));

-    fprintf(stderr, "%02d%02d %02d:%02d:%05d :: %s",
-        ctm.tm_mon + 1,
-        ctm.tm_mday,
-        ctm.tm_hour,
-        ctm.tm_min,
-        ELTimer_GetMSec() % 60000,
-        szBuf + 8);
-    fflush(stderr);
+    // OPTIMIZED: Write to buffered stderr instead of fprintf+fflush per call
+    g_syserrBuffer.Write(timestamp, strlen(timestamp));
+    g_syserrBuffer.Write(szBuf + 8, strlen(szBuf + 8)); // Skip "SYSERR: " prefix for stderr

 #ifdef _DEBUG
    DBG_OUT_W_UTF8(szBuf);
@@ -267,8 +405,7 @@ void TraceError(const char* c_szFormat, ...)

 void TraceErrorWithoutEnter(const char* c_szFormat, ...)
 {
-//#ifndef _DISTRIBUTE 
-
+//#ifndef _DISTRIBUTE
    char szBuf[DEBUG_STRING_MAX_LEN];

    va_list args;
@@ -276,17 +413,14 @@ void TraceErrorWithoutEnter(const char* c_szFormat, ...)
    _vsnprintf_s(szBuf, sizeof(szBuf), _TRUNCATE, c_szFormat, args);
    va_end(args);

-    time_t ct = time(0);
-    struct tm ctm = *localtime(&ct);
+    // OPTIMIZED: Use cached timestamp instead of time()/localtime() per call
+    g_cachedTimestamp.Update();
+    char timestamp[32];
+    g_cachedTimestamp.Format(timestamp, sizeof(timestamp));

-    fprintf(stderr, "%02d%02d %02d:%02d:%05d :: %s",
-        ctm.tm_mon + 1,
-        ctm.tm_mday,
-        ctm.tm_hour,
-        ctm.tm_min,
-        ELTimer_GetMSec() % 60000,
-        szBuf + 8);
-    fflush(stderr);
+    // OPTIMIZED: Write to buffered stderr instead of fprintf+fflush per call
+    g_syserrBuffer.Write(timestamp, strlen(timestamp));
+    g_syserrBuffer.Write(szBuf, strlen(szBuf));

 #ifdef _DEBUG
    DBG_OUT_W_UTF8(szBuf);
@@ -349,7 +483,7 @@ void OpenLogFile(bool bUseLogFIle)
        std::filesystem::create_directory("log");
    }

-//#ifndef _DISTRIBUTE 
+//#ifndef _DISTRIBUTE
    _wfreopen(L"log/syserr.txt", L"w", stderr);

    if (bUseLogFIle)
@@ -360,6 +494,13 @@ void OpenLogFile(bool bUseLogFIle)
 //#endif
 }

+void CloseLogFile() // Pushes everything still buffered by the two log writers out to their streams
+{
+    // Flush all buffered output before shutdown
+    g_syserrBuffer.Flush();       // pending syserr text -> stderr
+    CLogFile::Instance().Flush(); // pending log text -> log file; returns immediately when no file is open
+}
+
 void OpenConsoleWindow()
 {
    AllocConsole();
--- a/src/EterLib/GrpTextInstance.cpp
+++ b/src/EterLib/GrpTextInstance.cpp
@@ -238,16 +238,88 @@ void CGraphicTextInstance::Update()
 	}

 	// Tag-aware BiDi rendering: Parse tags, apply BiDi per segment, track colors/hyperlinks
+	// OPTIMIZED: Use helper lambda to eliminate code duplication (was repeated 5+ times)
 	if (hasRTL || hasTags)
 	{
 		DWORD currentColor = dwColor;
 		int hyperlinkStep = 0; // 0=normal, 1=collecting metadata, 2=visible hyperlink
 		std::wstring hyperlinkMetadata;
-		std::vector<wchar_t> currentSegment;
+
+		// Use thread-local buffer to avoid per-call allocation
+		thread_local static std::vector<wchar_t> s_currentSegment;
+		s_currentSegment.clear();

 		SHyperlink currentHyperlink;
 		currentHyperlink.sx = currentHyperlink.ex = 0;

+		// In chat RTL, force RTL base direction so prefixes like "[hyperlink]" don't flip the paragraph to LTR.
+		const bool forceRTLForBidi = (m_isChatMessage && m_computedRTL);
+
+		// OPTIMIZED: Single helper function for flushing segments (eliminates 5x code duplication)
+		auto FlushSegment = [&](DWORD segColor) -> int
+		{
+			if (s_currentSegment.empty())
+				return 0;
+
+			int totalWidth = 0;
+
+			// Apply BiDi transformation using optimized BuildVisualBidiText_Tagless
+			std::vector<wchar_t> visual = BuildVisualBidiText_Tagless(
+				s_currentSegment.data(), (int)s_currentSegment.size(), forceRTLForBidi);
+
+			for (size_t j = 0; j < visual.size(); ++j)
+			{
+				int w = __DrawCharacter(pFontTexture, visual[j], segColor);
+				totalWidth += w;
+			}
+
+			s_currentSegment.clear();
+			return totalWidth;
+		};
+
+		// Prepend glyphs to the already-built draw list (used to place hyperlink before message in RTL chat).
+		auto PrependGlyphs = [&](CGraphicFontTexture* pFontTexture,
+		                         const std::vector<wchar_t>& chars,
+		                         DWORD color,
+		                         int& outWidth)
+		{
+			outWidth = 0;
+
+			// Use thread-local buffers to avoid allocation
+			thread_local static std::vector<CGraphicFontTexture::TCharacterInfomation*> s_newCharInfos;
+			thread_local static std::vector<DWORD> s_newColors;
+			s_newCharInfos.clear();
+			s_newColors.clear();
+			s_newCharInfos.reserve(chars.size());
+			s_newColors.reserve(chars.size());
+
+			for (size_t k = 0; k < chars.size(); ++k)
+			{
+				auto* pInfo = pFontTexture->GetCharacterInfomation(chars[k]);
+				if (!pInfo)
+					continue;
+
+				s_newCharInfos.push_back(pInfo);
+				s_newColors.push_back(color);
+
+				outWidth += pInfo->advance;
+				m_textHeight = std::max((WORD)pInfo->height, m_textHeight);
+			}
+
+			// Insert at the beginning of the draw list.
+			m_pCharInfoVector.insert(m_pCharInfoVector.begin(), s_newCharInfos.begin(), s_newCharInfos.end());
+			m_dwColorInfoVector.insert(m_dwColorInfoVector.begin(), s_newColors.begin(), s_newColors.end());
+
+			// Shift any already-recorded hyperlinks to the right.
+			for (auto& link : m_hyperlinkVector)
+			{
+				link.sx += outWidth;
+				link.ex += outWidth;
+			}
+
+			m_textWidth += outWidth;
+		};
+
 		// Parse text with tags
 		for (int i = 0; i < wTextLen;)
 		{
@@ -257,37 +329,15 @@ void CGraphicTextInstance::Update()

 			if (tagType == TEXT_TAG_COLOR)
 			{
-				// Flush current segment with BiDi before changing color
-				if (!currentSegment.empty())
-				{
-					// Use auto-detection for BiDi (don't force RTL)
-					std::vector<wchar_t> visual = BuildVisualBidiText_Tagless(
-						currentSegment.data(), (int)currentSegment.size(), false);
-					for (size_t j = 0; j < visual.size(); ++j)
-					{
-						int w = __DrawCharacter(pFontTexture, visual[j], currentColor);
-						currentHyperlink.ex += w;
-					}
-					currentSegment.clear();
-				}
+				// Flush current segment before changing color
+				currentHyperlink.ex += FlushSegment(currentColor);
 				currentColor = htoi(tagExtra.c_str(), 8);
 				i += tagLen;
 			}
 			else if (tagType == TEXT_TAG_RESTORE_COLOR)
 			{
 				// Flush segment before restoring color
-				if (!currentSegment.empty())
-				{
-					// Use auto-detection for BiDi (don't force RTL)
-					std::vector<wchar_t> visual = BuildVisualBidiText_Tagless(
-						currentSegment.data(), (int)currentSegment.size(), false);
-					for (size_t j = 0; j < visual.size(); ++j)
-					{
-						int w = __DrawCharacter(pFontTexture, visual[j], currentColor);
-						currentHyperlink.ex += w;
-					}
-					currentSegment.clear();
-				}
+				currentHyperlink.ex += FlushSegment(currentColor);
 				currentColor = dwColor;
 				i += tagLen;
 			}
@@ -303,18 +353,7 @@ void CGraphicTextInstance::Update()
 				{
 					// End of metadata, start visible section
 					// Flush any pending non-hyperlink segment first
-					if (!currentSegment.empty())
-					{
-						// Use auto-detection for BiDi (don't force RTL)
-						std::vector<wchar_t> visual = BuildVisualBidiText_Tagless(
-							currentSegment.data(), (int)currentSegment.size(), false);
-						for (size_t j = 0; j < visual.size(); ++j)
-						{
-							int w = __DrawCharacter(pFontTexture, visual[j], currentColor);
-							currentHyperlink.ex += w;
-						}
-						currentSegment.clear();
-					}
+					currentHyperlink.ex += FlushSegment(currentColor);

 					hyperlinkStep = 2;
 					currentHyperlink.text = hyperlinkMetadata;
@@ -323,80 +362,85 @@ void CGraphicTextInstance::Update()
 				else if (hyperlinkStep == 2)
 				{
 					// End of visible section - render hyperlink text with proper Arabic handling
-					// Format: [Arabic Text] or [English Text]
-					// Keep brackets in position, reverse Arabic content between them
-					if (!currentSegment.empty())
+					// In RTL chat: we want the hyperlink chunk to appear BEFORE the message, even if logically appended.
+					if (!s_currentSegment.empty())
 					{
-						// Find bracket positions
+						// OPTIMIZED: Use thread-local buffer for visible rendering
+						thread_local static std::vector<wchar_t> s_visibleToRender;
+						s_visibleToRender.clear();
+
+						// Find bracket positions: [ ... ]
 						int openBracket = -1, closeBracket = -1;
-						for (size_t idx = 0; idx < currentSegment.size(); ++idx)
+						for (size_t idx = 0; idx < s_currentSegment.size(); ++idx)
 						{
-							if (currentSegment[idx] == L'[' && openBracket == -1)
+							if (s_currentSegment[idx] == L'[' && openBracket == -1)
 								openBracket = (int)idx;
-							else if (currentSegment[idx] == L']' && closeBracket == -1)
+							else if (s_currentSegment[idx] == L']' && closeBracket == -1)
 								closeBracket = (int)idx;
 						}

 						if (openBracket >= 0 && closeBracket > openBracket)
 						{
-							// Extract content between brackets
-							std::vector<wchar_t> content(
-								currentSegment.begin() + openBracket + 1,
-								currentSegment.begin() + closeBracket);
+							// Keep '['
+							s_visibleToRender.push_back(L'[');

-							// Apply Arabic shaping to content
-							std::vector<wchar_t> shaped(content.size() * 2 + 16, 0);
-							int shapedLen = Arabic_MakeShape(content.data(), (int)content.size(),
-							                                 shaped.data(), (int)shaped.size());
+							// Extract inside content and apply BiDi
+							thread_local static std::vector<wchar_t> s_content;
+							s_content.assign(
+								s_currentSegment.begin() + openBracket + 1,
+								s_currentSegment.begin() + closeBracket);

-							// Render: "[" + reversed_arabic + "]"
-							// 1. Opening bracket
-							int w = __DrawCharacter(pFontTexture, L'[', currentColor);
-							currentHyperlink.ex += w;
+							// FIX: Use false to let BiDi auto-detect direction from content
+							// This ensures English items like [Sword+9] stay LTR
+							// while Arabic items like [درع فولاذي+9] are properly RTL
+							std::vector<wchar_t> visual = BuildVisualBidiText_Tagless(
+								s_content.data(), (int)s_content.size(), false);

-							// 2. Arabic content (shaped and REVERSED for RTL display)
-							if (shapedLen > 0)
-							{
-								for (int j = shapedLen - 1; j >= 0; --j)
-								{
-									w = __DrawCharacter(pFontTexture, shaped[j], currentColor);
-									currentHyperlink.ex += w;
-								}
-							}
-							else
-							{
-								// Fallback: reverse original content
-								for (int j = (int)content.size() - 1; j >= 0; --j)
-								{
-									w = __DrawCharacter(pFontTexture, content[j], currentColor);
-									currentHyperlink.ex += w;
-								}
-							}
+							s_visibleToRender.insert(s_visibleToRender.end(), visual.begin(), visual.end());

-							// 3. Closing bracket
-							w = __DrawCharacter(pFontTexture, L']', currentColor);
-							currentHyperlink.ex += w;
-
-							// 4. Render any text after closing bracket (if any)
-							for (size_t idx = closeBracket + 1; idx < currentSegment.size(); ++idx)
-							{
-								w = __DrawCharacter(pFontTexture, currentSegment[idx], currentColor);
-								currentHyperlink.ex += w;
-							}
+							// Keep ']'
+							s_visibleToRender.push_back(L']');
 						}
 						else
 						{
-							// No brackets found - render as-is (shouldn't happen for hyperlinks)
-							for (size_t j = 0; j < currentSegment.size(); ++j)
+							// No brackets: apply BiDi to whole segment
+							// FIX: Use false to let BiDi auto-detect direction from content
+							std::vector<wchar_t> visual = BuildVisualBidiText_Tagless(
+								s_currentSegment.data(), (int)s_currentSegment.size(), false);
+
+							s_visibleToRender.insert(s_visibleToRender.end(), visual.begin(), visual.end());
+						}
+
+						// Ensure a space AFTER the hyperlink chunk (so it becomes "[hyperlink] اختبار...")
+						s_visibleToRender.push_back(L' ');
+
+						// Key behavior:
+						// In RTL chat, place hyperlink BEFORE the message by prepending glyphs.
+						if (m_isChatMessage && m_computedRTL)
+						{
+							int addedWidth = 0;
+							PrependGlyphs(pFontTexture, s_visibleToRender, currentColor, addedWidth);
+
+							// Record the hyperlink range at the beginning (0..addedWidth)
+							currentHyperlink.sx = 0;
+							currentHyperlink.ex = addedWidth;
+							m_hyperlinkVector.push_back(currentHyperlink);
+						}
+						else
+						{
+							// LTR or non-chat: keep original "append" behavior
+							currentHyperlink.sx = currentHyperlink.ex;
+							for (size_t j = 0; j < s_visibleToRender.size(); ++j)
 							{
-								int w = __DrawCharacter(pFontTexture, currentSegment[j], currentColor);
+								int w = __DrawCharacter(pFontTexture, s_visibleToRender[j], currentColor);
 								currentHyperlink.ex += w;
 							}
+							m_hyperlinkVector.push_back(currentHyperlink);
 						}
-						currentSegment.clear();
 					}
-					m_hyperlinkVector.push_back(currentHyperlink);
+
 					hyperlinkStep = 0;
+					s_currentSegment.clear();
 				}
 				i += tagLen;
 			}
@@ -411,24 +455,14 @@ void CGraphicTextInstance::Update()
 				{
 					// Add to current segment
 					// Will be BiDi-processed for normal text, or rendered directly for hyperlinks
-					currentSegment.push_back(wTextBuf[i]);
+					s_currentSegment.push_back(wTextBuf[i]);
 				}
 				i += tagLen;
 			}
 		}

-		// Flush any remaining segment
-		if (!currentSegment.empty())
-		{
-			// Use auto-detection for BiDi (don't force RTL)
-			std::vector<wchar_t> visual = BuildVisualBidiText_Tagless(
-				currentSegment.data(), (int)currentSegment.size(), false);
-			for (size_t j = 0; j < visual.size(); ++j)
-			{
-				int w = __DrawCharacter(pFontTexture, visual[j], currentColor);
-				currentHyperlink.ex += w;
-			}
-		}
+		// Flush any remaining segment using optimized helper
+		currentHyperlink.ex += FlushSegment(currentColor);

 		pFontTexture->UpdateTexture();
 		m_isUpdate = true;
--- a/src/EterLocale/Arabic.cpp
+++ b/src/EterLocale/Arabic.cpp
@@ -1,6 +1,7 @@
 #include "StdAfx.h"
 #include "Arabic.h"
 #include <assert.h>
+#include <vector>

 enum ARABIC_CODE
 {
@@ -243,110 +244,151 @@ bool Arabic_IsComb2(wchar_t code)
 	return false;
 }

+// True when `code` is a mapped Arabic letter that owns an INITIAL or MEDIAL form (it can connect to the following letter)
+static inline bool Arabic_CanJoinRight(wchar_t code)
+{
+	const bool hasForwardForm = Arabic_IsInMap(code) &&
+		(Arabic_GetMap(code, INITIAL) != 0 || Arabic_GetMap(code, MEDIAL) != 0);
+	return hasForwardForm;
+}
+
+// True when `code` is a mapped Arabic letter that can attach to the preceding letter (MEDIAL/FINAL form, or Arabic_IsNext)
+static inline bool Arabic_CanJoinLeft(wchar_t code)
+{
+	if (Arabic_IsInMap(code))
+		return Arabic_GetMap(code, MEDIAL) != 0 || Arabic_GetMap(code, FINAL) != 0 || Arabic_IsNext(code);
+	return false;
+}
+
+// Shapes Arabic text: writes the contextual (isolated/initial/medial/final) form of each
+// letter of src into dst, merges LAM+ALEF pairs, and returns the number of wchar_t written
+// (0 on invalid arguments). One forward pass over a pre-computed "next base character" table.
 size_t Arabic_MakeShape(wchar_t* src, size_t srcLen, wchar_t* dst, size_t dstLen)
 {
-	// Runtime validation instead of assert (which is disabled in release builds)
+	// Runtime validation: dst must hold at least srcLen characters (each source char yields at most one output char)
 	if (!src || !dst || srcLen == 0 || dstLen < srcLen)
 		return 0;

-	const size_t srcLastIndex = srcLen - 1;
+	// Phase 1: reverse pre-scan. s_nextArabic[i] = index of the next character after i
+	// that is not a composing mark (srcLen when there is none). Non-Arabic characters
+	// must be recorded too: a space or Latin letter between two Arabic letters breaks joining.
+	// Thread-local buffer avoids a per-call allocation.
+	thread_local static std::vector<size_t> s_nextArabic;
+	if (s_nextArabic.size() < srcLen + 1)
+		s_nextArabic.resize(srcLen + 1);
+
+	size_t nextBaseIdx = srcLen; // srcLen = no following base character
+	for (size_t i = srcLen; i > 0; --i)
+	{
+		size_t idx = i - 1;
+		s_nextArabic[idx] = nextBaseIdx;
+
+		if (!Arabic_IsInComposing(src[idx]))
+			nextBaseIdx = idx;
+	}
+	s_nextArabic[srcLen] = srcLen; // Sentinel
+
+	// Phase 2: Single forward pass with state tracking
+	size_t dstIndex = 0;
+	bool prevJoins = false; // Does previous Arabic letter join to the right?

-	unsigned dstIndex = 0;	
 	for (size_t srcIndex = 0; srcIndex < srcLen; ++srcIndex)
 	{
 		wchar_t cur = src[srcIndex];

-		//printf("now %x\n", cur);
+		// Composing marks: copy directly, don't affect joining state
+		if (Arabic_IsInComposing(cur))
+		{
+			if (dstIndex < dstLen)
+				dst[dstIndex++] = cur;
+			continue;
+		}

 		if (Arabic_IsInMap(cur))
 		{
-			// 이전 글자 얻어내기
-			wchar_t prev = 0;			
-			{
-				size_t prevIndex = srcIndex;
-				while (prevIndex > 0)
-				{
-					prevIndex--;
-					prev = src[prevIndex];
-					//printf("\tprev %d:%x\n", prevIndex, cur);
-					if (Arabic_IsInComposing(prev))
-						continue;
-					else
-						break;
-				}
-
-				if ((srcIndex == 0) || 
-					(!Arabic_IsInMap(prev)) || 
-					(!Arabic_GetMap(prev, INITIAL) && !Arabic_GetMap(prev, MEDIAL)))
-				{
-					//printf("\tprev not defined\n");
-					prev = 0;
-				}
-			}
-
-			// 다음 글자 얻어내기
+			// Next base character (marks skipped) from the pre-computed table; it only counts if it can join left
 			wchar_t next = 0;
+			size_t nextIdx = s_nextArabic[srcIndex];
+			if (nextIdx < srcLen)
 			{
-				size_t nextIndex = srcIndex;
-				while (nextIndex < srcLastIndex)
-				{
-					nextIndex++;
-					next = src[nextIndex];
-					if (Arabic_IsInComposing(next))
-						continue;
-					else
-						break;
-				}
-
-				if ((nextIndex == srcLen) || 
-					(!Arabic_IsInMap(next)) ||
-					(!Arabic_GetMap(next, MEDIAL) && !Arabic_GetMap(next, FINAL) && !Arabic_IsNext(next)))
-				{
-					//printf("\tnext not defined\n");
-					next = 0;
-				}
+				wchar_t nextChar = src[nextIdx];
+				if (Arabic_CanJoinLeft(nextChar))
+					next = nextChar;
 			}

-			if (Arabic_IsComb1(cur) && Arabic_IsComb2(next))
+			// Handle LAM-ALEF composition (only when ALEF is the very next base character)
+			if (Arabic_IsComb1(cur) && nextIdx < srcLen && Arabic_IsComb2(src[nextIdx]))
 			{
-				if (prev)
-					dst[dstIndex] = Arabic_GetComposition(cur, next, FINAL);
+				wchar_t composed;
+				if (prevJoins)
+					composed = Arabic_GetComposition(cur, src[nextIdx], FINAL);
 				else
-					dst[dstIndex] = Arabic_GetComposition(cur, next, ISOLATED);
+					composed = Arabic_GetComposition(cur, src[nextIdx], ISOLATED);

-				//printf("\tGot me a complex:%x\n", dst[dstIndex]);
+				if (dstIndex < dstLen)
+					dst[dstIndex++] = composed;

-				srcIndex++;
-				dstIndex++;				
+				// Re-emit marks found between LAM and ALEF so they are not lost, then skip the ALEF
+				for (size_t k = srcIndex + 1; k < nextIdx && dstIndex < dstLen; ++k)
+					dst[dstIndex++] = src[k];
+				srcIndex = nextIdx;
+				prevJoins = false; // LAM-ALEF doesn't join to the right
+				continue;
 			}
-			else if (prev && next && (dst[dstIndex] = Arabic_GetMap(cur, MEDIAL)))
+
+			// Determine form based on joining context
+			wchar_t shaped = 0;
+			bool curJoinsRight = false;
+
+			if (prevJoins && next)
 			{
-				//printf("\tGot prev & next:%x\n", dst[dstIndex]);
-				dstIndex++;				
+				// Both sides join: MEDIAL
+				shaped = Arabic_GetMap(cur, MEDIAL);
+				if (shaped)
+					curJoinsRight = Arabic_CanJoinRight(cur);
 			}
-			else if (prev && (dst[dstIndex] = Arabic_GetMap(cur, FINAL)))
+
+			if (!shaped && prevJoins)
 			{
-				//printf("\tGot prev:%x\n", dst[dstIndex]);
-				dstIndex++;				
+				// Only left joins: FINAL
+				shaped = Arabic_GetMap(cur, FINAL);
+				// FINAL form doesn't extend to the right
+				curJoinsRight = false;
 			}
-			else if (next && (dst[dstIndex] = Arabic_GetMap(cur, INITIAL)))
+
+			if (!shaped && next)
 			{
-				//printf("\tGot next:%x\n", dst[dstIndex]);
-				dstIndex++;				
+				// Only right joins: INITIAL
+				shaped = Arabic_GetMap(cur, INITIAL);
+				if (shaped)
+					curJoinsRight = Arabic_CanJoinRight(cur);
 			}
-			else
+
+			if (!shaped)
 			{
-				dst[dstIndex] = Arabic_GetMap(cur, ISOLATED);
-				//printf("\tGot nothing:%x\n", dst[dstIndex]);
-				dstIndex++;
+				// No joining: ISOLATED
+				shaped = Arabic_GetMap(cur, ISOLATED);
+				curJoinsRight = false;
 			}
+
+			if (!shaped)
+				shaped = cur; // Fallback to original if no mapping
+
+			if (dstIndex < dstLen)
+				dst[dstIndex++] = shaped;
+
+			// Update state for next character
+			prevJoins = curJoinsRight;
 		}
 		else
 		{
-			dst[dstIndex] = cur;
-			dstIndex++;
+			// Non-Arabic character: copy directly, breaks joining
+			if (dstIndex < dstLen)
+				dst[dstIndex++] = cur;
+			prevJoins = false;
 		}
 	}
+
 	return dstIndex;
 }

--- a/src/UserInterface/PythonSkill.cpp
+++ b/src/UserInterface/PythonSkill.cpp
@@ -1279,11 +1279,60 @@ float CPythonSkill::SSkillData::ProcessFormula(CPoly * pPoly, float fSkillLevel,
 	return pPoly->Eval();
 }

-static void ReplaceFirst(std::string& s, const char* needle, const std::string& repl)
+// printf-style placeholders that may appear in a skill description
+static const char* FORMAT_SPECIFIERS[] = {
+	"%.0f",  // whole number
+	"%.1f",  // one decimal
+	"%.2f",  // two decimals
+	"%d",    // whole number (alternate spelling)
+};
+static const size_t FORMAT_SPECIFIER_COUNT = sizeof(FORMAT_SPECIFIERS) / sizeof(FORMAT_SPECIFIERS[0]);
+
+// Substitutes value for the leftmost placeholder from FORMAT_SPECIFIERS found in s.
+// Returns true when a placeholder was replaced, false when s contains none.
+static bool ReplaceNextFormatSpecifier(std::string& s, float value)
 {
-	size_t pos = s.find(needle);
-	if (pos != std::string::npos)
-		s.replace(pos, strlen(needle), repl);
+	// Scan every supported specifier and remember the one that occurs earliest in s
+	size_t hitPos = std::string::npos;
+	const char* hitSpec = nullptr;
+	for (size_t idx = 0; idx != FORMAT_SPECIFIER_COUNT; ++idx)
+	{
+		const size_t found = s.find(FORMAT_SPECIFIERS[idx]);
+		if (found == std::string::npos)
+			continue;
+		if (hitPos != std::string::npos && found >= hitPos)
+			continue;
+
+		hitPos = found;
+		hitSpec = FORMAT_SPECIFIERS[idx];
+	}
+
+	// Nothing left to substitute
+	if (!hitSpec)
+		return false;
+
+	// Render the number the way the matched specifier asks for:
+	// "%.0f" and "%d" floor to a whole number, "%.1f" keeps one decimal, anything else two
+	const bool wholeNumber = (strcmp(hitSpec, "%.0f") == 0) || (strcmp(hitSpec, "%d") == 0);
+	char szValue[64];
+	if (wholeNumber)
+		_snprintf(szValue, sizeof(szValue), "%.0f", floorf(value));
+	else if (strcmp(hitSpec, "%.1f") == 0)
+		_snprintf(szValue, sizeof(szValue), "%.1f", value);
+	else
+		_snprintf(szValue, sizeof(szValue), "%.2f", value);
+
+	s.replace(hitPos, strlen(hitSpec), szValue);
+	return true;
+}
+
+// Collapse every escaped "%%" in s into a single "%"
+static void UnescapePercent(std::string& s)
+{
+	for (size_t at = s.find("%%"); at != std::string::npos; at = s.find("%%", at + 1))
+	{
+		s.replace(at, 2, "%");
+	}
 }

 const char* CPythonSkill::SSkillData::GetAffectDescription(DWORD dwIndex, float fSkillLevel)
@@ -1303,33 +1352,20 @@ const char* CPythonSkill::SSkillData::GetAffectDescription(DWORD dwIndex, float
 	float fMinValue = ProcessFormula(&minPoly, fSkillLevel);
 	float fMaxValue = ProcessFormula(&maxPoly, fSkillLevel);

+	// Take absolute values
 	if (fMinValue < 0.0f) fMinValue = -fMinValue;
 	if (fMaxValue < 0.0f) fMaxValue = -fMaxValue;

-	const bool wantsInt = (desc.find("%.0f") != std::string::npos);
-	if (wantsInt)
-	{
-		fMinValue = floorf(fMinValue);
-		fMaxValue = floorf(fMaxValue);
-	}
-
-	char szMin[64], szMax[64];
-	if (wantsInt)
-	{
-		_snprintf(szMin, sizeof(szMin), "%.0f", fMinValue);
-		_snprintf(szMax, sizeof(szMax), "%.0f", fMaxValue);
-	}
-	else
-	{
-		_snprintf(szMin, sizeof(szMin), "%.2f", fMinValue);
-		_snprintf(szMax, sizeof(szMax), "%.2f", fMaxValue);
-	}
-
 	static std::string out;
 	out = desc;

-	ReplaceFirst(out, "%.0f", szMin);
-	ReplaceFirst(out, "%.0f", szMax);
+	// Replace format specifiers in order of appearance
+	// First specifier gets min value, second gets max value
+	ReplaceNextFormatSpecifier(out, fMinValue);
+	ReplaceNextFormatSpecifier(out, fMaxValue);
+
+	// Convert escaped %% to single % (for display like "30%")
+	UnescapePercent(out);

 	return out.c_str();
 }