154 lines
6.5 KiB
C
154 lines
6.5 KiB
C
/*
 * utf8.c
 * Copyright © 2016 David A. Baer
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *  1. Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *  2. Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *  3. Neither the name of the organization nor the names of its contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY David A. Baer ''AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL David A. Baer BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
|
|
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include "utf8.h"
|
|
|
|
utf8iterator*
|
|
utf8NewIterator(const char* txt) {
|
|
utf8iterator* result = malloc(sizeof(utf8iterator));
|
|
memset(result, 0, sizeof(utf8iterator));
|
|
result->txt = txt;
|
|
return result;
|
|
}
|
|
|
|
uint32_t utf8CharAt(const utf8iterator* iter) {
|
|
int byteIndex = iter->byteIndex;
|
|
if ((iter->txt[byteIndex] & 0x80) == 0) {
|
|
return iter->txt[byteIndex];
|
|
} else if (((iter->txt[byteIndex] & 0xe0) == 0xc0) &&
|
|
((iter->txt[byteIndex + 1] & 0xc0) == 0x80)) {
|
|
return (uint32_t)(iter->txt[byteIndex + 1] & 0x3f) |
|
|
((uint32_t)(iter->txt[byteIndex] & 0x1f) << 6);
|
|
} else if (((iter->txt[byteIndex] & 0xf0) == 0xe0) &&
|
|
((iter->txt[byteIndex + 1] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 2] & 0xc0) == 0x80)) {
|
|
return (uint32_t)(iter->txt[byteIndex + 2] & 0x3f) |
|
|
((uint32_t)(iter->txt[byteIndex + 1] & 0x3f) << 6) |
|
|
((uint32_t)(iter->txt[byteIndex] & 0xf) << 12);
|
|
} else if (((iter->txt[byteIndex] & 0xf8) == 0xf0) &&
|
|
((iter->txt[byteIndex + 1] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 2] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 3] & 0xc0) == 0x80)) {
|
|
return (uint32_t)(iter->txt[byteIndex + 3] & 0x3f) |
|
|
((uint32_t)(iter->txt[byteIndex + 2] & 0x3f) << 6) |
|
|
((uint32_t)(iter->txt[byteIndex + 1] & 0x3f) << 12) |
|
|
((uint32_t)(iter->txt[byteIndex] & 0x07) << 18);
|
|
} else if (((iter->txt[byteIndex] & 0xfc) == 0xf8) &&
|
|
((iter->txt[byteIndex + 1] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 2] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 3] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 4] & 0xc0) == 0x80)) {
|
|
return (uint32_t)(iter->txt[byteIndex + 4] & 0x3f) |
|
|
((uint32_t)(iter->txt[byteIndex + 3] & 0x3f) << 6) |
|
|
((uint32_t)(iter->txt[byteIndex + 2] & 0x3f) << 12) |
|
|
((uint32_t)(iter->txt[byteIndex + 1] & 0x3f) << 18) |
|
|
((uint32_t)(iter->txt[byteIndex] & 0x03) << 24);
|
|
|
|
} else if (((iter->txt[byteIndex] & 0xf7) == 0xfc) &&
|
|
((iter->txt[byteIndex + 1] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 2] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 3] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 4] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 5] & 0xc0) == 0x80)) {
|
|
return (uint32_t)(iter->txt[byteIndex + 5] & 0x3f) |
|
|
((uint32_t)(iter->txt[byteIndex + 4] & 0x3f) << 6) |
|
|
((uint32_t)(iter->txt[byteIndex + 3] & 0x3f) << 12) |
|
|
((uint32_t)(iter->txt[byteIndex + 2] & 0x3f) << 18) |
|
|
((uint32_t)(iter->txt[byteIndex + 1] & 0x3f) << 24) |
|
|
((uint32_t)(iter->txt[byteIndex] & 0x01) << 30);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static int
|
|
_next_offset(const utf8iterator* iter) {
|
|
int byteIndex = iter->byteIndex;
|
|
if ((iter->txt[byteIndex] & 0x80) == 0) {
|
|
return 1;
|
|
} else if (((iter->txt[byteIndex] & 0xe0) == 0xc0) &&
|
|
((iter->txt[byteIndex + 1] & 0xc0) == 0x80)) {
|
|
return 2;
|
|
} else if (((iter->txt[byteIndex] & 0xf0) == 0xe0) &&
|
|
((iter->txt[byteIndex + 1] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 2] & 0xc0) == 0x80)) {
|
|
return 3;
|
|
} else if (((iter->txt[byteIndex] & 0xf8) == 0xf0) &&
|
|
((iter->txt[byteIndex + 1] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 2] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 3] & 0xc0) == 0x80)) {
|
|
return 4;
|
|
} else if (((iter->txt[byteIndex] & 0xfc) == 0xf8) &&
|
|
((iter->txt[byteIndex + 1] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 2] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 3] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 4] & 0xc0) == 0x80)) {
|
|
return 5;
|
|
} else if (((iter->txt[byteIndex] & 0xf7) == 0xfc) &&
|
|
((iter->txt[byteIndex + 1] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 2] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 3] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 4] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 5] & 0xc0) == 0x80)) {
|
|
return 6;
|
|
} else return -1;
|
|
}
|
|
|
|
int
|
|
utf8Advance(utf8iterator* iter) {
|
|
int next_offset = _next_offset(iter);
|
|
if (next_offset > 0) {
|
|
iter->byteIndex += next_offset;
|
|
return ++iter->logicalIndex;
|
|
} else return -1;
|
|
}
|
|
|
|
int
|
|
utf8CopyEncodedCharAt(const utf8iterator* iter, size_t bufsize, char* dest) {
|
|
size_t sz = _next_offset(iter);
|
|
if (sz + 1 > bufsize) {
|
|
return 0;
|
|
} else {
|
|
memset(dest, 0, bufsize);
|
|
memcpy(dest, iter->txt + iter->byteIndex, sz);
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
void
|
|
utf8ResetIterator(utf8iterator* iter) {
|
|
iter->logicalIndex = iter->byteIndex = 0;
|
|
}
|
|
|
|
void
|
|
utf8FreeIterator(utf8iterator* iter) {
|
|
free(iter);
|
|
}
|