Files
sermon/src/utf8.c
2017-01-22 19:52:14 -05:00

154 lines
6.5 KiB
C

/*
* utf8.c
* Copyright © 2016 David A. Baer
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the organization nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY David A. Baer ''AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL David A. Baer BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <stdlib.h>
#include <string.h>
#include "utf8.h"
/*
 * Allocate a new iterator positioned at the start of txt.
 *
 * The iterator stores the txt pointer itself and does not copy the string,
 * so txt must remain valid for as long as the iterator is in use.
 *
 * Returns the new iterator with all other fields zeroed, or NULL if the
 * allocation fails. Release the iterator with utf8FreeIterator().
 */
utf8iterator*
utf8NewIterator(const char* txt) {
    /* calloc gives zeroed storage, so the indices start at 0 */
    utf8iterator* result = calloc(1, sizeof *result);

    if (result == NULL) {
        return NULL;
    }
    result->txt = txt;
    return result;
}
/*
 * Decode the character encoded at the iterator's current byte position.
 *
 * Sequences of one to six bytes (the original UTF-8 scheme) are accepted.
 * The lead byte selects the sequence length, and every following byte must
 * be a continuation byte of the form 10xxxxxx.
 *
 * Returns the decoded value, or 0 if the bytes at the current position do
 * not form a well-formed sequence. A NUL byte also decodes to 0, so the two
 * cases cannot be told apart from the return value alone. Overlong forms
 * and surrogate values are not rejected.
 */
uint32_t utf8CharAt(const utf8iterator* iter) {
    /* Work on unsigned bytes so the result does not depend on char signedness. */
    const unsigned char* p = (const unsigned char*)iter->txt + iter->byteIndex;
    unsigned char lead = p[0];
    uint32_t code;
    int trailing;
    int i;

    if ((lead & 0x80) == 0) {
        return lead;                    /* 0xxxxxxx: plain ASCII */
    } else if ((lead & 0xe0) == 0xc0) {
        trailing = 1;                   /* 110xxxxx */
        code = lead & 0x1f;
    } else if ((lead & 0xf0) == 0xe0) {
        trailing = 2;                   /* 1110xxxx */
        code = lead & 0x0f;
    } else if ((lead & 0xf8) == 0xf0) {
        trailing = 3;                   /* 11110xxx */
        code = lead & 0x07;
    } else if ((lead & 0xfc) == 0xf8) {
        trailing = 4;                   /* 111110xx */
        code = lead & 0x03;
    } else if ((lead & 0xfe) == 0xfc) {
        trailing = 5;                   /* 1111110x */
        code = lead & 0x01;
    } else {
        return 0;                       /* stray continuation byte, 0xfe or 0xff */
    }

    /*
     * Bytes are examined in order and the loop stops at the first byte that
     * is not a continuation byte. A NUL terminator is not a continuation
     * byte, so a truncated sequence never reads past the end of the string.
     */
    for (i = 1; i <= trailing; i++) {
        if ((p[i] & 0xc0) != 0x80) {
            return 0;
        }
        code = (code << 6) | (uint32_t)(p[i] & 0x3f);
    }
    return code;
}
/*
 * Return the length in bytes (1 to 6) of the encoded character at the
 * iterator's current byte position, or -1 if the bytes there do not form a
 * well-formed sequence.
 *
 * A NUL byte counts as an ordinary one-byte character, so this function
 * returns 1 at the end of the string; it does not detect the terminator.
 */
static int
_next_offset(const utf8iterator* iter) {
    /* Work on unsigned bytes so the result does not depend on char signedness. */
    const unsigned char* p = (const unsigned char*)iter->txt + iter->byteIndex;
    unsigned char lead = p[0];
    int length;
    int i;

    if ((lead & 0x80) == 0) {
        return 1;                       /* 0xxxxxxx */
    } else if ((lead & 0xe0) == 0xc0) {
        length = 2;                     /* 110xxxxx */
    } else if ((lead & 0xf0) == 0xe0) {
        length = 3;                     /* 1110xxxx */
    } else if ((lead & 0xf8) == 0xf0) {
        length = 4;                     /* 11110xxx */
    } else if ((lead & 0xfc) == 0xf8) {
        length = 5;                     /* 111110xx */
    } else if ((lead & 0xfe) == 0xfc) {
        length = 6;                     /* 1111110x */
    } else {
        return -1;                      /* stray continuation byte, 0xfe or 0xff */
    }

    /*
     * Every remaining byte must look like 10xxxxxx. Checking in order and
     * stopping at the first mismatch means a NUL terminator ends the scan,
     * so a truncated sequence never reads past the end of the string.
     */
    for (i = 1; i < length; i++) {
        if ((p[i] & 0xc0) != 0x80) {
            return -1;
        }
    }
    return length;
}
/*
 * Move the iterator forward by one encoded character.
 *
 * Returns the new logical (character) index on success. If the bytes at the
 * current position are not a well-formed sequence, the iterator is left
 * unchanged and -1 is returned.
 *
 * NOTE: a NUL byte is treated as a one-byte character by _next_offset(), so
 * this function will step past the string terminator if called there.
 * Callers need their own end-of-string check.
 */
int
utf8Advance(utf8iterator* iter) {
    int step = _next_offset(iter);

    if (step <= 0) {
        return -1;
    }

    iter->byteIndex += step;
    iter->logicalIndex += 1;
    return iter->logicalIndex;
}
/*
 * Copy the encoded bytes of the character at the iterator's current
 * position into dest as a NUL-terminated string.
 *
 * bufsize is the total size of dest in bytes. On success the whole buffer
 * is zero-filled first, so the copied bytes are always followed by a NUL.
 *
 * Returns 1 on success. Returns 0, leaving dest untouched, if the bytes at
 * the current position are not a well-formed sequence or if dest is too
 * small to hold the character plus its terminator.
 */
int
utf8CopyEncodedCharAt(const utf8iterator* iter, size_t bufsize, char* dest) {
    int len = _next_offset(iter);
    size_t sz;

    /*
     * _next_offset() signals an invalid sequence with -1. That must be
     * rejected before converting to size_t: otherwise it becomes SIZE_MAX,
     * "sz + 1" wraps to 0, the size check passes, and memcpy runs wild.
     */
    if (len < 0) {
        return 0;
    }

    sz = (size_t)len;
    if (sz + 1 > bufsize) {
        return 0;
    }

    memset(dest, 0, bufsize);
    memcpy(dest, iter->txt + iter->byteIndex, sz);
    return 1;
}
/*
 * Rewind the iterator to the beginning of its text: both the byte offset
 * and the logical (character) index are set back to zero. The text pointer
 * is left as it is.
 */
void utf8ResetIterator(utf8iterator* iter) {
    iter->byteIndex = 0;
    iter->logicalIndex = 0;
}
/*
 * Release an iterator. Only the iterator object itself is freed; the text
 * it points at is not touched. Passing NULL is harmless because free(NULL)
 * is a no-op.
 */
void utf8FreeIterator(utf8iterator* iter) {
    free(iter);
}