about | summary | refs | log | tree | commit | diff
path: root/source/string.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'source/string.cpp')
-rw-r--r-- source/string.cpp 464
1 files changed, 464 insertions, 0 deletions
diff --git a/source/string.cpp b/source/string.cpp
new file mode 100644
index 0000000..ae73f11
--- /dev/null
+++ b/source/string.cpp
@@ -0,0 +1,464 @@
+///////////////////////////////////////////////////////////////////////////////
+// Copyright (c) Electronic Arts Inc. All rights reserved.
+///////////////////////////////////////////////////////////////////////////////
+
+
+#include <EASTL/internal/config.h>
+#include <EASTL/string.h>
+#include <EABase/eabase.h>
+#include <string.h>
+
+
+namespace eastl
+{
+ ///////////////////////////////////////////////////////////////////////////////
+ // Converters for DecodePart
+ //
+ // For some decent documentation about conversions, see:
+ // http://tidy.sourceforge.net/cgi-bin/lxr/source/src/utf8.c
+ //
+ ///////////////////////////////////////////////////////////////////////////////
+
+ // Encodes the UCS4 value c as UTF-8 (using the original 1 to 6 byte scheme)
+ // at pResult and advances pResult past the bytes that were written.
+ // No bounds checking is done here: pResult must have a capacity of at least 6 chars.
+ // Returns true on success. Values >= 0x80000000 can't be encoded; in that case
+ // a single '\1' is written to pResult and false is returned.
+ inline bool UCS4ToUTF8(uint32_t c, char*& pResult)
+ {
+     if(c < 0x00000080)
+     {
+         // 1 byte: 0xxxxxxx
+         *pResult++ = (char)(uint8_t)c;
+     }
+     else if(c < 0x00000800)
+     {
+         // 2 bytes: 110xxxxx 10xxxxxx
+         *pResult++ = (char)(uint8_t)(0xC0 | (c >> 6));
+         *pResult++ = (char)(uint8_t)(0x80 | (c & 0x3F));
+     }
+     else if(c <= 0x0000FFFF)
+     {
+         // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+         *pResult++ = (char)(uint8_t)(0xE0 | (c >> 12));
+         *pResult++ = (char)(uint8_t)(0x80 | ((c >> 6) & 0x3F));
+         *pResult++ = (char)(uint8_t)(0x80 | (c & 0x3F));
+     }
+     else if(c <= 0x001FFFFF)
+     {
+         // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+         *pResult++ = (char)(uint8_t)(0xF0 | (c >> 18));
+         *pResult++ = (char)(uint8_t)(0x80 | ((c >> 12) & 0x3F));
+         *pResult++ = (char)(uint8_t)(0x80 | ((c >> 6) & 0x3F));
+         *pResult++ = (char)(uint8_t)(0x80 | (c & 0x3F));
+     }
+     else if(c <= 0x03FFFFFF)
+     {
+         // 5 bytes: 111110xx followed by four 10xxxxxx bytes.
+         // Every continuation byte must be masked to 6 bits; (c >> 18) alone can
+         // be as large as 0xFF here and would otherwise set the wrong top bits.
+         *pResult++ = (char)(uint8_t)(0xF8 | (c >> 24));
+         *pResult++ = (char)(uint8_t)(0x80 | ((c >> 18) & 0x3F));
+         *pResult++ = (char)(uint8_t)(0x80 | ((c >> 12) & 0x3F));
+         *pResult++ = (char)(uint8_t)(0x80 | ((c >> 6) & 0x3F));
+         *pResult++ = (char)(uint8_t)(0x80 | (c & 0x3F));
+     }
+     else if(c <= 0x7FFFFFFF)
+     {
+         // 6 bytes: 1111110x followed by five 10xxxxxx bytes.
+         *pResult++ = (char)(uint8_t)(0xFC | (c >> 30));
+         *pResult++ = (char)(uint8_t)(0x80 | ((c >> 24) & 0x3F));
+         *pResult++ = (char)(uint8_t)(0x80 | ((c >> 18) & 0x3F));
+         *pResult++ = (char)(uint8_t)(0x80 | ((c >> 12) & 0x3F));
+         *pResult++ = (char)(uint8_t)(0x80 | ((c >> 6) & 0x3F));
+         *pResult++ = (char)(uint8_t)(0x80 | (c & 0x3F));
+     }
+     else
+     {
+         // values >= 0x80000000 can't be converted to UTF8.
+         *pResult++ = '\1';
+         return false;
+     }
+
+     return true;
+ }
+
+
+ // Encodes the UCS2 value c as UTF-8 at pResult by widening it to 32 bits and
+ // forwarding to UCS4ToUTF8. pResult is advanced past the bytes written.
+ // Requires that pResult have a capacity of at least 3 chars.
+ // Returns whatever UCS4ToUTF8 returns for the widened value.
+ inline bool UCS2ToUTF8(uint16_t c, char*& pResult)
+ {
+     const uint32_t c32 = c;
+
+     return UCS4ToUTF8(c32, pResult);
+ }
+
+
+ // Decodes one UTF-8 sequence starting at p (and bounded by pEnd) into a UCS4 value.
+ //
+ // On success, result receives the decoded code point, p is advanced past the
+ // whole sequence and true is returned.
+ // On failure (continuation or invalid lead byte, bad continuation byte, overlong
+ // encoding, value above 0x0010FFFF, or a sequence truncated by pEnd) result is
+ // set to 0xffff, p is advanced by a single byte and false is returned.
+ // If p is already at or beyond pEnd, p is left untouched and false is returned.
+ // 0xffff (rather than 0xffffffff) is the value reported for bad input.
+ //
+ // 5 and 6 byte sequences (lead bytes 0xF8..0xFD) are not decoded: when enough
+ // bytes are available they are skipped as a unit and reported as 0xffff with a
+ // true return value. We don't currently support extended UCS4 characters.
+ inline bool UTF8ToUCS4(const char*& p, const char* pEnd, uint32_t& result)
+ {
+     // This could likely be implemented in a faster-executing way that uses tables.
+
+     bool     success = false;
+     uint32_t c       = 0xffff;
+     size_t   length  = 0;       // Number of bytes consumed when success is true.
+
+     if(p < pEnd)
+     {
+         // Compare against the remaining byte count instead of forming p + N,
+         // which could point beyond the end of the source buffer.
+         const size_t  available = (size_t)(pEnd - p);
+         const uint8_t cChar0    = static_cast<uint8_t>(p[0]);
+
+         if(cChar0 < 0x80)
+         {
+             // 0xxxxxxx
+             length  = 1;
+             c       = cChar0;
+             success = true;
+         }
+         else if((cChar0 & 0xE0) == 0xC0)
+         {
+             // 110xxxxx 10xxxxxx
+             length = 2;
+
+             if(available >= length)
+             {
+                 const uint8_t cChar1 = static_cast<uint8_t>(p[1]);
+
+                 c = ((uint32_t)(cChar0 & 0x1F) << 6) | (uint32_t)(cChar1 & 0x3F);
+
+                 success = ((cChar1 & 0xC0) == 0x80) &&  // All subsequent code should be b10xxxxxx
+                           (c >= 0x0080);                // Check that we have the smallest coding
+             }
+         }
+         else if((cChar0 & 0xF0) == 0xE0)
+         {
+             // 1110xxxx 10xxxxxx 10xxxxxx
+             length = 3;
+
+             if(available >= length)
+             {
+                 const uint8_t cChar1 = static_cast<uint8_t>(p[1]);
+                 const uint8_t cChar2 = static_cast<uint8_t>(p[2]);
+
+                 c = ((uint32_t)(cChar0 & 0x0F) << 12) |
+                     ((uint32_t)(cChar1 & 0x3F) <<  6) |
+                      (uint32_t)(cChar2 & 0x3F);
+
+                 success = ((cChar1 & 0xC0) == 0x80) &&  // All subsequent code should be b10xxxxxx
+                           ((cChar2 & 0xC0) == 0x80) &&
+                           (c >= 0x00000800);            // Check that we have the smallest coding
+             }
+         }
+         else if((cChar0 & 0xF8) == 0xF0)
+         {
+             // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+             length = 4;
+
+             if(available >= length)
+             {
+                 const uint8_t cChar1 = static_cast<uint8_t>(p[1]);
+                 const uint8_t cChar2 = static_cast<uint8_t>(p[2]);
+                 const uint8_t cChar3 = static_cast<uint8_t>(p[3]);
+
+                 c = ((uint32_t)(cChar0 & 0x07) << 18) |
+                     ((uint32_t)(cChar1 & 0x3F) << 12) |
+                     ((uint32_t)(cChar2 & 0x3F) <<  6) |
+                      (uint32_t)(cChar3 & 0x3F);
+
+                 success = ((cChar1 & 0xC0) == 0x80) &&  // All subsequent code should be b10xxxxxx
+                           ((cChar2 & 0xC0) == 0x80) &&
+                           ((cChar3 & 0xC0) == 0x80) &&  // The last byte has to be validated as well.
+                           (c >= 0x00010000) && (c <= 0x0010FFFF); // Smallest coding, Unicode and not ucs-4
+             }
+         }
+         else if((cChar0 & 0xFC) == 0xF8)
+         {
+             // 111110xx + 4 continuation bytes.
+             // To do. We don't currently support extended UCS4 characters.
+             // The sequence is skipped as a whole and reported as 0xffff.
+             length  = 5;
+             success = (available >= length);
+         }
+         else if((cChar0 & 0xFE) == 0xFC)
+         {
+             // 1111110x + 5 continuation bytes.
+             // To do. We don't currently support extended UCS4 characters.
+             // The sequence is skipped as a whole and reported as 0xffff.
+             length  = 6;
+             success = (available >= length);
+         }
+         // Otherwise cChar0 is a stray continuation byte (0x80..0xBF) or one of
+         // 0xFE/0xFF, neither of which can start a sequence: success stays false.
+     }
+
+     if(success)
+     {
+         p     += length;
+         result = c;
+     }
+     else
+     {
+         // Skip a single byte so that the caller can try to resynchronize,
+         // but never step beyond the end of the source range.
+         if(p < pEnd)
+             ++p;
+
+         result = 0xffff;
+     }
+
+     return success;
+ }
+
+ // Decodes one UTF-8 sequence via UTF8ToUCS4 and narrows the outcome to UCS2.
+ // p is updated by UTF8ToUCS4. result receives the decoded value and true is
+ // returned only when decoding succeeded and the value fits in 16 bits.
+ // In every other case (bad input, or a code point > 0xffff) result is set to
+ // 0xffff and false is returned. EASTL doesn't have a concept of setting or
+ // maintaining error state for string conversions, though it does have a policy
+ // of converting impossible values to something without generating invalid
+ // strings or throwing exceptions.
+ inline bool UTF8ToUCS2(const char*& p, const char* pEnd, uint16_t& result)
+ {
+     uint32_t codePoint;
+
+     const bool decoded = UTF8ToUCS4(p, pEnd, codePoint);
+     const bool fits    = decoded && (codePoint <= 0xffff);
+
+     result = fits ? (uint16_t)codePoint : (uint16_t)0xffff;
+
+     return fits;
+ }
+
+
+
+ ///////////////////////////////////////////////////////////////////////////
+ // DecodePart
+ ///////////////////////////////////////////////////////////////////////////
+
+ // char -> char: source and destination use the same encoding, so this is a
+ // plain copy of as many chars as both ranges allow. pSrc and pDest are both
+ // advanced by the number of chars copied. Always returns true.
+ EASTL_API bool DecodePart(const char*& pSrc, const char* pSrcEnd, char*& pDest, char* pDestEnd)
+ {
+     const size_t available = (size_t)(pSrcEnd - pSrc);
+     const size_t room      = (size_t)(pDestEnd - pDest);
+     const size_t count     = (room < available) ? room : available;
+
+     // memmove is used, so the two ranges are allowed to overlap.
+     memmove(pDest, pSrc, count * sizeof(*pSrc));
+
+     pDest += count;
+     pSrc  += count;
+
+     return true;
+ }
+
+ // char (UTF-8) -> char16_t (UCS2): decodes one sequence per iteration until the
+ // source is exhausted, the destination is full, or a sequence fails to convert.
+ // pSrc and pDest are advanced as data is consumed and produced. A sequence that
+ // fails to convert still produces one 0xffff element before the loop stops.
+ // Returns false if a sequence could not be converted, true otherwise.
+ EASTL_API bool DecodePart(const char*& pSrc, const char* pSrcEnd, char16_t*& pDest, char16_t* pDestEnd)
+ {
+     bool success = true;
+
+     while(success && (pSrc < pSrcEnd) && (pDest < pDestEnd))
+     {
+         // Decode into a real uint16_t and then store it. char16_t and uint16_t
+         // are distinct types, so binding a uint16_t& to *pDest would violate
+         // the aliasing rules.
+         uint16_t c = 0xffff;
+
+         success  = UTF8ToUCS2(pSrc, pSrcEnd, c);
+         *pDest++ = (char16_t)c;
+     }
+
+     return success;
+ }
+
+ // char (UTF-8) -> char32_t (UCS4): decodes one sequence per iteration until the
+ // source is exhausted, the destination is full, or a sequence fails to convert.
+ // pSrc and pDest are advanced as data is consumed and produced. A sequence that
+ // fails to convert still produces one 0xffff element before the loop stops.
+ // Returns false if a sequence could not be converted, true otherwise.
+ EASTL_API bool DecodePart(const char*& pSrc, const char* pSrcEnd, char32_t*& pDest, char32_t* pDestEnd)
+ {
+     bool success = true;
+
+     while(success && (pSrc < pSrcEnd) && (pDest < pDestEnd))
+     {
+         // Decode into a real uint32_t and then store it. char32_t and uint32_t
+         // are distinct types, so binding a uint32_t& to *pDest would violate
+         // the aliasing rules.
+         uint32_t c = 0xffff;
+
+         success  = UTF8ToUCS4(pSrc, pSrcEnd, c);
+         *pDest++ = (char32_t)c;
+     }
+
+     return success;
+ }
+
+
+ // char16_t (UCS2) -> char (UTF-8): encodes one source element per iteration.
+ // pSrc and pDest are advanced as data is consumed and produced.
+ // Returns false as soon as UCS2ToUTF8 reports a failure, true otherwise.
+ EASTL_API bool DecodePart(const char16_t*& pSrc, const char16_t* pSrcEnd, char*& pDest, char* pDestEnd)
+ {
+     EASTL_ASSERT((pDest + 6) < pDestEnd); // The user must provide ample buffer space, preferably 256 chars or more.
+
+     // Stop 6 chars short of the real end so that neither this loop nor the
+     // encoder it calls has to check the destination size for each byte written.
+     char* const pDestLimit = pDestEnd - 6;
+
+     while((pSrc < pSrcEnd) && (pDest < pDestLimit))
+     {
+         if(!UCS2ToUTF8(*pSrc++, pDest))
+             return false;
+     }
+
+     return true;
+ }
+
+ // char16_t -> char16_t: source and destination use the same encoding, so this
+ // is a plain copy of as many elements as both ranges allow. pSrc and pDest are
+ // both advanced by the number of elements copied. Always returns true.
+ EASTL_API bool DecodePart(const char16_t*& pSrc, const char16_t* pSrcEnd, char16_t*& pDest, char16_t* pDestEnd)
+ {
+     const size_t available = (size_t)(pSrcEnd - pSrc);
+     const size_t room      = (size_t)(pDestEnd - pDest);
+     const size_t count     = (room < available) ? room : available;
+
+     // memmove is used, so the two ranges are allowed to overlap.
+     memmove(pDest, pSrc, count * sizeof(*pSrc));
+
+     pDest += count;
+     pSrc  += count;
+
+     return true;
+ }
+
+ // char16_t -> char32_t: widens each source element with a plain cast, copying
+ // as many elements as both ranges allow. pSrc and pDest are both advanced by
+ // the number of elements converted. Always returns true.
+ EASTL_API bool DecodePart(const char16_t*& pSrc, const char16_t* pSrcEnd, char32_t*& pDest, char32_t* pDestEnd)
+ {
+     const size_t available = (size_t)(pSrcEnd - pSrc);
+     const size_t room      = (size_t)(pDestEnd - pDest);
+
+     // Convert no more elements than the destination can hold.
+     // To consider: Improve this by unrolling this loop. Other tricks can improve its speed as well.
+     for(size_t remaining = (room < available) ? room : available; remaining != 0; --remaining)
+         *pDest++ = (char32_t)*pSrc++;
+
+     return true;
+ }
+
+
+ // char32_t (UCS4) -> char (UTF-8): encodes one source element per iteration.
+ // pSrc and pDest are advanced as data is consumed and produced.
+ // Returns false as soon as UCS4ToUTF8 reports a failure, true otherwise.
+ EASTL_API bool DecodePart(const char32_t*& pSrc, const char32_t* pSrcEnd, char*& pDest, char* pDestEnd)
+ {
+     EASTL_ASSERT((pDest + 6) < pDestEnd); // The user must provide ample buffer space, preferably 256 chars or more.
+
+     // Stop 6 chars short of the real end so that neither this loop nor the
+     // encoder it calls has to check the destination size for each byte written.
+     char* const pDestLimit = pDestEnd - 6;
+
+     while((pSrc < pSrcEnd) && (pDest < pDestLimit))
+     {
+         if(!UCS4ToUTF8(*pSrc++, pDest))
+             return false;
+     }
+
+     return true;
+ }
+
+ // char32_t -> char16_t: narrows each source element with a plain cast, copying
+ // as many elements as both ranges allow. pSrc and pDest are both advanced by
+ // the number of elements converted. Always returns true.
+ EASTL_API bool DecodePart(const char32_t*& pSrc, const char32_t* pSrcEnd, char16_t*& pDest, char16_t* pDestEnd)
+ {
+     const size_t available = (size_t)(pSrcEnd - pSrc);
+     const size_t room      = (size_t)(pDestEnd - pDest);
+
+     // Convert no more elements than the destination can hold.
+     // To consider: Improve this by unrolling this loop. Other tricks can improve its speed as well.
+     for(size_t remaining = (room < available) ? room : available; remaining != 0; --remaining)
+         *pDest++ = (char16_t)*pSrc++; // This is potentially losing data. We are not converting to UTF16; we are converting to UCS2.
+
+     return true;
+ }
+
+ // char32_t -> char32_t: source and destination use the same encoding, so this
+ // is a plain copy of as many elements as both ranges allow. pSrc and pDest are
+ // both advanced by the number of elements copied. Always returns true.
+ EASTL_API bool DecodePart(const char32_t*& pSrc, const char32_t* pSrcEnd, char32_t*& pDest, char32_t* pDestEnd)
+ {
+     const size_t available = (size_t)(pSrcEnd - pSrc);
+     const size_t room      = (size_t)(pDestEnd - pDest);
+     const size_t count     = (room < available) ? room : available;
+
+     // memmove is used, so the two ranges are allowed to overlap.
+     memmove(pDest, pSrc, count * sizeof(*pSrc));
+
+     pDest += count;
+     pSrc  += count;
+
+     return true;
+ }
+
+ // int -> char (UTF-8): each int is reinterpreted as an unsigned 32 bit value and
+ // encoded with UCS4ToUTF8, one source element per iteration.
+ // pSrc and pDest are advanced as data is consumed and produced.
+ // Returns false as soon as UCS4ToUTF8 reports a failure, true otherwise.
+ EASTL_API bool DecodePart(const int*& pSrc, const int* pSrcEnd, char*& pDest, char* pDestEnd)
+ {
+     EASTL_ASSERT((pDest + 6) < pDestEnd); // The user must provide ample buffer space, preferably 256 chars or more.
+
+     // Stop 6 chars short of the real end so that neither this loop nor the
+     // encoder it calls has to check the destination size for each byte written.
+     char* const pDestLimit = pDestEnd - 6;
+
+     while((pSrc < pSrcEnd) && (pDest < pDestLimit))
+     {
+         if(!UCS4ToUTF8((uint32_t)(unsigned)*pSrc++, pDest))
+             return false;
+     }
+
+     return true;
+ }
+
+ // int -> char16_t: narrows each source element with a plain cast, copying as
+ // many elements as both ranges allow. pSrc and pDest are both advanced by the
+ // number of elements converted. Always returns true.
+ EASTL_API bool DecodePart(const int*& pSrc, const int* pSrcEnd, char16_t*& pDest, char16_t* pDestEnd)
+ {
+     const size_t available = (size_t)(pSrcEnd - pSrc);
+     const size_t room      = (size_t)(pDestEnd - pDest);
+
+     // Convert no more elements than the destination can hold.
+     // To consider: Improve this by unrolling this loop. Other tricks can improve its speed as well.
+     for(size_t remaining = (room < available) ? room : available; remaining != 0; --remaining)
+         *pDest++ = (char16_t)*pSrc++; // This is potentially losing data. We are not converting to UTF16; we are converting to UCS2.
+
+     return true;
+ }
+
+ // int -> char32_t: converts each source element with a plain cast, copying as
+ // many elements as both ranges allow. pSrc and pDest are both advanced by the
+ // number of elements converted. Always returns true.
+ EASTL_API bool DecodePart(const int*& pSrc, const int* pSrcEnd, char32_t*& pDest, char32_t* pDestEnd)
+ {
+     const size_t available = (size_t)(pSrcEnd - pSrc);
+     const size_t room      = (size_t)(pDestEnd - pDest);
+
+     // Convert no more elements than the destination can hold.
+     // To consider: Improve this by unrolling this loop. Other tricks can improve its speed as well.
+     for(size_t remaining = (room < available) ? room : available; remaining != 0; --remaining)
+         *pDest++ = (char32_t)*pSrc++; // Values are cast as-is; no range validation is done.
+
+     return true;
+ }
+
+
+
+} // namespace eastl
+
+
+
+
+
+
+
+
+
+
+
+
+
+