Numerous updates:
* Add Makefile to install data
* Reference text is now formatted
* UTF-8 string processing
This commit is contained in:
124
src/utf8.c
Normal file
124
src/utf8.c
Normal file
@@ -0,0 +1,124 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "utf8.h"
|
||||
|
||||
utf8iterator*
|
||||
utf8NewIterator(const char* txt) {
|
||||
utf8iterator* result = malloc(sizeof(utf8iterator));
|
||||
memset(result, 0, sizeof(utf8iterator));
|
||||
result->txt = txt;
|
||||
return result;
|
||||
}
|
||||
|
||||
uint32_t utf8CharAt(const utf8iterator* iter) {
|
||||
int byteIndex = iter->byteIndex;
|
||||
if ((iter->txt[byteIndex] & 0x80) == 0) {
|
||||
return iter->txt[byteIndex];
|
||||
} else if (((iter->txt[byteIndex] & 0xe0) == 0xc0) &&
|
||||
((iter->txt[byteIndex + 1] & 0xc0) == 0x80)) {
|
||||
return (uint32_t)(iter->txt[byteIndex + 1] & 0x3f) |
|
||||
((uint32_t)(iter->txt[byteIndex] & 0x1f) << 6);
|
||||
} else if (((iter->txt[byteIndex] & 0xf0) == 0xe0) &&
|
||||
((iter->txt[byteIndex + 1] & 0xc0) == 0x80) &&
|
||||
((iter->txt[byteIndex + 2] & 0xc0) == 0x80)) {
|
||||
return (uint32_t)(iter->txt[byteIndex + 2] & 0x3f) |
|
||||
((uint32_t)(iter->txt[byteIndex + 1] & 0x3f) << 6) |
|
||||
((uint32_t)(iter->txt[byteIndex] & 0xf) << 12);
|
||||
} else if (((iter->txt[byteIndex] & 0xf8) == 0xf0) &&
|
||||
((iter->txt[byteIndex + 1] & 0xc0) == 0x80) &&
|
||||
((iter->txt[byteIndex + 2] & 0xc0) == 0x80) &&
|
||||
((iter->txt[byteIndex + 3] & 0xc0) == 0x80)) {
|
||||
return (uint32_t)(iter->txt[byteIndex + 3] & 0x3f) |
|
||||
((uint32_t)(iter->txt[byteIndex + 2] & 0x3f) << 6) |
|
||||
((uint32_t)(iter->txt[byteIndex + 1] & 0x3f) << 12) |
|
||||
((uint32_t)(iter->txt[byteIndex] & 0x07) << 18);
|
||||
} else if (((iter->txt[byteIndex] & 0xfc) == 0xf8) &&
|
||||
((iter->txt[byteIndex + 1] & 0xc0) == 0x80) &&
|
||||
((iter->txt[byteIndex + 2] & 0xc0) == 0x80) &&
|
||||
((iter->txt[byteIndex + 3] & 0xc0) == 0x80) &&
|
||||
((iter->txt[byteIndex + 4] & 0xc0) == 0x80)) {
|
||||
return (uint32_t)(iter->txt[byteIndex + 4] & 0x3f) |
|
||||
((uint32_t)(iter->txt[byteIndex + 3] & 0x3f) << 6) |
|
||||
((uint32_t)(iter->txt[byteIndex + 2] & 0x3f) << 12) |
|
||||
((uint32_t)(iter->txt[byteIndex + 1] & 0x3f) << 18) |
|
||||
((uint32_t)(iter->txt[byteIndex] & 0x03) << 24);
|
||||
|
||||
} else if (((iter->txt[byteIndex] & 0xf7) == 0xfc) &&
|
||||
((iter->txt[byteIndex + 1]) & 0xc0 == 0x80) &&
|
||||
((iter->txt[byteIndex + 2] & 0xc0) == 0x80) &&
|
||||
((iter->txt[byteIndex + 3] & 0xc0) == 0x80) &&
|
||||
((iter->txt[byteIndex + 4] & 0xc0) == 0x80) &&
|
||||
((iter->txt[byteIndex + 5] & 0xc0) == 0x80)) {
|
||||
return (uint32_t)(iter->txt[byteIndex + 5] & 0x3f) |
|
||||
((uint32_t)(iter->txt[byteIndex + 4] & 0x3f) << 6) |
|
||||
((uint32_t)(iter->txt[byteIndex + 3] & 0x3f) << 12) |
|
||||
((uint32_t)(iter->txt[byteIndex + 2] & 0x3f) << 18) |
|
||||
((uint32_t)(iter->txt[byteIndex + 1] & 0x3f) << 24) |
|
||||
((uint32_t)(iter->txt[byteIndex] & 0x01) << 30);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
_next_offset(const utf8iterator* iter) {
|
||||
int byteIndex = iter->byteIndex;
|
||||
if ((iter->txt[byteIndex] & 0x80) == 0) {
|
||||
return 1;
|
||||
} else if (((iter->txt[byteIndex] & 0xe0) == 0xc0) &&
|
||||
((iter->txt[byteIndex + 1] & 0xc0) == 0x80)) {
|
||||
return 2;
|
||||
} else if (((iter->txt[byteIndex] & 0xf0) == 0xe0) &&
|
||||
((iter->txt[byteIndex + 1] & 0xc0) == 0x80) &&
|
||||
((iter->txt[byteIndex + 2] & 0xc0) == 0x80)) {
|
||||
return 3;
|
||||
} else if (((iter->txt[byteIndex] & 0xf8) == 0xf0) &&
|
||||
((iter->txt[byteIndex + 1] & 0xc0) == 0x80) &&
|
||||
((iter->txt[byteIndex + 2] & 0xc0) == 0x80) &&
|
||||
((iter->txt[byteIndex + 3] & 0xc0) == 0x80)) {
|
||||
return 4;
|
||||
} else if (((iter->txt[byteIndex] & 0xfc) == 0xf8) &&
|
||||
((iter->txt[byteIndex + 1] & 0xc0) == 0x80) &&
|
||||
((iter->txt[byteIndex + 2] & 0xc0) == 0x80) &&
|
||||
((iter->txt[byteIndex + 3] & 0xc0) == 0x80) &&
|
||||
((iter->txt[byteIndex + 4] & 0xc0) == 0x80)) {
|
||||
return 5;
|
||||
} else if (((iter->txt[byteIndex] & 0xf7) == 0xfc) &&
|
||||
((iter->txt[byteIndex + 1]) & 0xc0 == 0x80) &&
|
||||
((iter->txt[byteIndex + 2] & 0xc0) == 0x80) &&
|
||||
((iter->txt[byteIndex + 3] & 0xc0) == 0x80) &&
|
||||
((iter->txt[byteIndex + 4] & 0xc0) == 0x80) &&
|
||||
((iter->txt[byteIndex + 5] & 0xc0) == 0x80)) {
|
||||
return 6;
|
||||
} else return -1;
|
||||
}
|
||||
|
||||
int
|
||||
utf8Advance(utf8iterator* iter) {
|
||||
int next_offset = _next_offset(iter);
|
||||
if (next_offset > 0) {
|
||||
iter->byteIndex += next_offset;
|
||||
return ++iter->logicalIndex;
|
||||
} else return -1;
|
||||
}
|
||||
|
||||
int
|
||||
utf8CopyEncodedCharAt(const utf8iterator* iter, size_t bufsize, char* dest) {
|
||||
size_t sz = _next_offset(iter);
|
||||
if (sz + 1 > bufsize) {
|
||||
return 0;
|
||||
} else {
|
||||
memset(dest, 0, bufsize);
|
||||
memcpy(dest, iter->txt + iter->byteIndex, sz);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
utf8ResetIterator(utf8iterator* iter) {
|
||||
iter->logicalIndex = iter->byteIndex = 0;
|
||||
}
|
||||
|
||||
void
|
||||
utf8FreeIterator(utf8iterator* iter) {
|
||||
free(iter);
|
||||
}
|
||||
Reference in New Issue
Block a user