/* * utf8.c * Copyright © 2016 David A. Baer * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the organization nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY David A. Baer ''AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL David A. Baer BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ #include #include #include "utf8.h" utf8iterator* utf8NewIterator(const char* txt) { utf8iterator* result = malloc(sizeof(utf8iterator)); memset(result, 0, sizeof(utf8iterator)); result->txt = txt; return result; } uint32_t utf8CharAt(const utf8iterator* iter) { int byteIndex = iter->byteIndex; if ((iter->txt[byteIndex] & 0x80) == 0) { return iter->txt[byteIndex]; } else if (((iter->txt[byteIndex] & 0xe0) == 0xc0) && ((iter->txt[byteIndex + 1] & 0xc0) == 0x80)) { return (uint32_t)(iter->txt[byteIndex + 1] & 0x3f) | ((uint32_t)(iter->txt[byteIndex] & 0x1f) << 6); } else if (((iter->txt[byteIndex] & 0xf0) == 0xe0) && ((iter->txt[byteIndex + 1] & 0xc0) == 0x80) && ((iter->txt[byteIndex + 2] & 0xc0) == 0x80)) { return (uint32_t)(iter->txt[byteIndex + 2] & 0x3f) | ((uint32_t)(iter->txt[byteIndex + 1] & 0x3f) << 6) | ((uint32_t)(iter->txt[byteIndex] & 0xf) << 12); } else if (((iter->txt[byteIndex] & 0xf8) == 0xf0) && ((iter->txt[byteIndex + 1] & 0xc0) == 0x80) && ((iter->txt[byteIndex + 2] & 0xc0) == 0x80) && ((iter->txt[byteIndex + 3] & 0xc0) == 0x80)) { return (uint32_t)(iter->txt[byteIndex + 3] & 0x3f) | ((uint32_t)(iter->txt[byteIndex + 2] & 0x3f) << 6) | ((uint32_t)(iter->txt[byteIndex + 1] & 0x3f) << 12) | ((uint32_t)(iter->txt[byteIndex] & 0x07) << 18); } else if (((iter->txt[byteIndex] & 0xfc) == 0xf8) && ((iter->txt[byteIndex + 1] & 0xc0) == 0x80) && ((iter->txt[byteIndex + 2] & 0xc0) == 0x80) && ((iter->txt[byteIndex + 3] & 0xc0) == 0x80) && ((iter->txt[byteIndex + 4] & 0xc0) == 0x80)) { return (uint32_t)(iter->txt[byteIndex + 4] & 0x3f) | ((uint32_t)(iter->txt[byteIndex + 3] & 0x3f) << 6) | ((uint32_t)(iter->txt[byteIndex + 2] & 0x3f) << 12) | ((uint32_t)(iter->txt[byteIndex + 1] & 0x3f) << 18) | ((uint32_t)(iter->txt[byteIndex] & 0x03) << 24); } else if (((iter->txt[byteIndex] & 0xf7) == 0xfc) && ((iter->txt[byteIndex + 1] & 0xc0) == 0x80) && ((iter->txt[byteIndex + 2] & 0xc0) == 0x80) && ((iter->txt[byteIndex + 3] & 0xc0) == 0x80) && ((iter->txt[byteIndex + 4] & 0xc0) == 0x80) && ((iter->txt[byteIndex + 5] & 0xc0) == 0x80)) { return (uint32_t)(iter->txt[byteIndex + 5] & 0x3f) | ((uint32_t)(iter->txt[byteIndex + 4] & 0x3f) << 6) | ((uint32_t)(iter->txt[byteIndex + 3] & 0x3f) << 12) | ((uint32_t)(iter->txt[byteIndex + 2] & 0x3f) << 18) | ((uint32_t)(iter->txt[byteIndex + 1] & 0x3f) << 24) | ((uint32_t)(iter->txt[byteIndex] & 0x01) << 30); } return 0; } static int _next_offset(const utf8iterator* iter) { int byteIndex = iter->byteIndex; if ((iter->txt[byteIndex] & 0x80) == 0) { return 1; } else if (((iter->txt[byteIndex] & 0xe0) == 0xc0) && ((iter->txt[byteIndex + 1] & 0xc0) == 0x80)) { return 2; } else if (((iter->txt[byteIndex] & 0xf0) == 0xe0) && ((iter->txt[byteIndex + 1] & 0xc0) == 0x80) && ((iter->txt[byteIndex + 2] & 0xc0) == 0x80)) { return 3; } else if (((iter->txt[byteIndex] & 0xf8) == 0xf0) && ((iter->txt[byteIndex + 1] & 0xc0) == 0x80) && ((iter->txt[byteIndex + 2] & 0xc0) == 0x80) && ((iter->txt[byteIndex + 3] & 0xc0) == 0x80)) { return 4; } else if (((iter->txt[byteIndex] & 0xfc) == 0xf8) && ((iter->txt[byteIndex + 1] & 0xc0) == 0x80) && ((iter->txt[byteIndex + 2] & 0xc0) == 0x80) && ((iter->txt[byteIndex + 3] & 0xc0) == 0x80) && ((iter->txt[byteIndex + 4] & 0xc0) == 0x80)) { return 5; } else if (((iter->txt[byteIndex] & 0xf7) == 0xfc) && ((iter->txt[byteIndex + 1] & 0xc0) == 0x80) && ((iter->txt[byteIndex + 2] & 0xc0) == 0x80) && ((iter->txt[byteIndex + 3] & 0xc0) == 0x80) && ((iter->txt[byteIndex + 4] & 0xc0) == 0x80) && ((iter->txt[byteIndex + 5] & 0xc0) == 0x80)) { return 6; } else return -1; } int utf8Advance(utf8iterator* iter) { int next_offset = _next_offset(iter); if (next_offset > 0) { iter->byteIndex += next_offset; return ++iter->logicalIndex; } else return -1; } int utf8CopyEncodedCharAt(const utf8iterator* iter, size_t bufsize, char* dest) { size_t sz = _next_offset(iter); if (sz + 1 > bufsize) { return 0; } else { memset(dest, 0, bufsize); memcpy(dest, iter->txt + iter->byteIndex, sz); return 1; } } void utf8ResetIterator(utf8iterator* iter) { iter->logicalIndex = iter->byteIndex = 0; } void utf8FreeIterator(utf8iterator* iter) { free(iter); }