* Add makefile to install data * Reference text is now formatted * UTF-8 string processing
125 lines
5.0 KiB
C
125 lines
5.0 KiB
C
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include "utf8.h"
|
|
|
|
utf8iterator*
|
|
utf8NewIterator(const char* txt) {
|
|
utf8iterator* result = malloc(sizeof(utf8iterator));
|
|
memset(result, 0, sizeof(utf8iterator));
|
|
result->txt = txt;
|
|
return result;
|
|
}
|
|
|
|
uint32_t utf8CharAt(const utf8iterator* iter) {
|
|
int byteIndex = iter->byteIndex;
|
|
if ((iter->txt[byteIndex] & 0x80) == 0) {
|
|
return iter->txt[byteIndex];
|
|
} else if (((iter->txt[byteIndex] & 0xe0) == 0xc0) &&
|
|
((iter->txt[byteIndex + 1] & 0xc0) == 0x80)) {
|
|
return (uint32_t)(iter->txt[byteIndex + 1] & 0x3f) |
|
|
((uint32_t)(iter->txt[byteIndex] & 0x1f) << 6);
|
|
} else if (((iter->txt[byteIndex] & 0xf0) == 0xe0) &&
|
|
((iter->txt[byteIndex + 1] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 2] & 0xc0) == 0x80)) {
|
|
return (uint32_t)(iter->txt[byteIndex + 2] & 0x3f) |
|
|
((uint32_t)(iter->txt[byteIndex + 1] & 0x3f) << 6) |
|
|
((uint32_t)(iter->txt[byteIndex] & 0xf) << 12);
|
|
} else if (((iter->txt[byteIndex] & 0xf8) == 0xf0) &&
|
|
((iter->txt[byteIndex + 1] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 2] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 3] & 0xc0) == 0x80)) {
|
|
return (uint32_t)(iter->txt[byteIndex + 3] & 0x3f) |
|
|
((uint32_t)(iter->txt[byteIndex + 2] & 0x3f) << 6) |
|
|
((uint32_t)(iter->txt[byteIndex + 1] & 0x3f) << 12) |
|
|
((uint32_t)(iter->txt[byteIndex] & 0x07) << 18);
|
|
} else if (((iter->txt[byteIndex] & 0xfc) == 0xf8) &&
|
|
((iter->txt[byteIndex + 1] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 2] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 3] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 4] & 0xc0) == 0x80)) {
|
|
return (uint32_t)(iter->txt[byteIndex + 4] & 0x3f) |
|
|
((uint32_t)(iter->txt[byteIndex + 3] & 0x3f) << 6) |
|
|
((uint32_t)(iter->txt[byteIndex + 2] & 0x3f) << 12) |
|
|
((uint32_t)(iter->txt[byteIndex + 1] & 0x3f) << 18) |
|
|
((uint32_t)(iter->txt[byteIndex] & 0x03) << 24);
|
|
|
|
} else if (((iter->txt[byteIndex] & 0xf7) == 0xfc) &&
|
|
((iter->txt[byteIndex + 1]) & 0xc0 == 0x80) &&
|
|
((iter->txt[byteIndex + 2] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 3] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 4] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 5] & 0xc0) == 0x80)) {
|
|
return (uint32_t)(iter->txt[byteIndex + 5] & 0x3f) |
|
|
((uint32_t)(iter->txt[byteIndex + 4] & 0x3f) << 6) |
|
|
((uint32_t)(iter->txt[byteIndex + 3] & 0x3f) << 12) |
|
|
((uint32_t)(iter->txt[byteIndex + 2] & 0x3f) << 18) |
|
|
((uint32_t)(iter->txt[byteIndex + 1] & 0x3f) << 24) |
|
|
((uint32_t)(iter->txt[byteIndex] & 0x01) << 30);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static int
|
|
_next_offset(const utf8iterator* iter) {
|
|
int byteIndex = iter->byteIndex;
|
|
if ((iter->txt[byteIndex] & 0x80) == 0) {
|
|
return 1;
|
|
} else if (((iter->txt[byteIndex] & 0xe0) == 0xc0) &&
|
|
((iter->txt[byteIndex + 1] & 0xc0) == 0x80)) {
|
|
return 2;
|
|
} else if (((iter->txt[byteIndex] & 0xf0) == 0xe0) &&
|
|
((iter->txt[byteIndex + 1] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 2] & 0xc0) == 0x80)) {
|
|
return 3;
|
|
} else if (((iter->txt[byteIndex] & 0xf8) == 0xf0) &&
|
|
((iter->txt[byteIndex + 1] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 2] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 3] & 0xc0) == 0x80)) {
|
|
return 4;
|
|
} else if (((iter->txt[byteIndex] & 0xfc) == 0xf8) &&
|
|
((iter->txt[byteIndex + 1] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 2] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 3] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 4] & 0xc0) == 0x80)) {
|
|
return 5;
|
|
} else if (((iter->txt[byteIndex] & 0xf7) == 0xfc) &&
|
|
((iter->txt[byteIndex + 1]) & 0xc0 == 0x80) &&
|
|
((iter->txt[byteIndex + 2] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 3] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 4] & 0xc0) == 0x80) &&
|
|
((iter->txt[byteIndex + 5] & 0xc0) == 0x80)) {
|
|
return 6;
|
|
} else return -1;
|
|
}
|
|
|
|
int
|
|
utf8Advance(utf8iterator* iter) {
|
|
int next_offset = _next_offset(iter);
|
|
if (next_offset > 0) {
|
|
iter->byteIndex += next_offset;
|
|
return ++iter->logicalIndex;
|
|
} else return -1;
|
|
}
|
|
|
|
int
|
|
utf8CopyEncodedCharAt(const utf8iterator* iter, size_t bufsize, char* dest) {
|
|
size_t sz = _next_offset(iter);
|
|
if (sz + 1 > bufsize) {
|
|
return 0;
|
|
} else {
|
|
memset(dest, 0, bufsize);
|
|
memcpy(dest, iter->txt + iter->byteIndex, sz);
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
void
|
|
utf8ResetIterator(utf8iterator* iter) {
|
|
iter->logicalIndex = iter->byteIndex = 0;
|
|
}
|
|
|
|
void
|
|
utf8FreeIterator(utf8iterator* iter) {
|
|
free(iter);
|
|
}
|