Numerous updates:

* Add makefile to install data

* Reference text is now formatted

* UTF-8 string processing
This commit is contained in:
David Baer
2015-08-06 14:31:38 -04:00
parent af38b1eafc
commit cfc0ba7e9a
13 changed files with 269 additions and 7 deletions

124
src/utf8.c Normal file
View File

@@ -0,0 +1,124 @@
#include <stdlib.h>
#include <string.h>
#include "utf8.h"
/*
 * Allocate a new iterator over the string `txt`.
 *
 * All fields of the iterator start out zeroed, then `txt` is stored in it.
 * The string is NOT copied: only the pointer is kept, so the caller must
 * keep `txt` alive for as long as the iterator is used.
 *
 * Returns the new iterator, or NULL if the allocation fails. Release it
 * with utf8FreeIterator().
 */
utf8iterator*
utf8NewIterator(const char* txt) {
    /* calloc zero-fills, so no separate memset is needed. */
    utf8iterator* result = calloc(1, sizeof *result);

    if (result == NULL) {
        /* Out of memory: report it instead of writing through NULL. */
        return NULL;
    }

    result->txt = txt;
    return result;
}
/*
 * Decode the character that starts at the iterator's current byte position.
 *
 * Recognises the lead-byte patterns for sequences of 1 to 6 bytes:
 *   0xxxxxxx, 110xxxxx, 1110xxxx, 11110xxx, 111110xx, 1111110x
 * and requires every following byte to have the continuation form 10xxxxxx.
 * The payload bits are concatenated most-significant first.
 *
 * Returns the decoded value, or 0 if the bytes at the current position do
 * not form one of the sequences above. A NUL byte also decodes to 0, so 0
 * cannot be told apart from "invalid" by the return value alone.
 *
 * No check is made for overlong encodings or for values outside the
 * Unicode range.
 */
uint32_t utf8CharAt(const utf8iterator* iter) {
    /* View the bytes as unsigned so masks and shifts never see a sign bit. */
    const unsigned char* p = (const unsigned char*)iter->txt + iter->byteIndex;
    unsigned char lead = p[0];
    uint32_t value;
    int trailing;
    int i;

    if ((lead & 0x80) == 0x00) {
        return lead;                      /* single byte, 7 payload bits */
    } else if ((lead & 0xe0) == 0xc0) {
        trailing = 1;
        value = lead & 0x1f;
    } else if ((lead & 0xf0) == 0xe0) {
        trailing = 2;
        value = lead & 0x0f;
    } else if ((lead & 0xf8) == 0xf0) {
        trailing = 3;
        value = lead & 0x07;
    } else if ((lead & 0xfc) == 0xf8) {
        trailing = 4;
        value = lead & 0x03;
    } else if ((lead & 0xfe) == 0xfc) {
        /* Mask is 0xfe: the pattern 1111110x fixes the top seven bits. */
        trailing = 5;
        value = lead & 0x01;
    } else {
        return 0;                         /* stray continuation byte, 0xfe or 0xff */
    }

    /*
     * Stop at the first byte that is not a continuation byte. A NUL
     * terminator fails this test, so the loop never reads past the end of
     * a terminated string.
     */
    for (i = 1; i <= trailing; i++) {
        if ((p[i] & 0xc0) != 0x80) {
            return 0;
        }
        value = (value << 6) | (uint32_t)(p[i] & 0x3f);
    }

    return value;
}
/*
 * Return the length in bytes (1 to 6) of the encoded character that starts
 * at the iterator's current byte position, or -1 if the bytes there do not
 * form a complete sequence (unrecognised lead byte, or a following byte
 * that is not of the continuation form 10xxxxxx).
 *
 * Any byte with the top bit clear counts as a one-byte character; that
 * includes the NUL terminator, for which 1 is returned.
 */
static int
_next_offset(const utf8iterator* iter) {
    /* View the bytes as unsigned so the bit tests are sign-independent. */
    const unsigned char* p = (const unsigned char*)iter->txt + iter->byteIndex;
    unsigned char lead = p[0];
    int length;
    int i;

    if ((lead & 0x80) == 0x00) {
        return 1;
    } else if ((lead & 0xe0) == 0xc0) {
        length = 2;
    } else if ((lead & 0xf0) == 0xe0) {
        length = 3;
    } else if ((lead & 0xf8) == 0xf0) {
        length = 4;
    } else if ((lead & 0xfc) == 0xf8) {
        length = 5;
    } else if ((lead & 0xfe) == 0xfc) {
        /* Mask is 0xfe: the pattern 1111110x fixes the top seven bits. */
        length = 6;
    } else {
        return -1;
    }

    /*
     * Every byte after the lead must be a continuation byte. Checking in
     * order and stopping at the first mismatch means a NUL terminator ends
     * the scan before anything beyond it is read.
     */
    for (i = 1; i < length; i++) {
        if ((p[i] & 0xc0) != 0x80) {
            return -1;
        }
    }

    return length;
}
/*
 * Move the iterator forward by one encoded character.
 *
 * On success the byte position grows by the length of the current
 * character, the logical (character) index is incremented, and the new
 * logical index is returned. If the bytes at the current position are not
 * a valid sequence, the iterator is left untouched and -1 is returned.
 *
 * NOTE: a NUL byte counts as a one-byte character here, so calling this
 * while positioned on the terminator steps past it. Callers should check
 * for the end of the string themselves before advancing.
 */
int
utf8Advance(utf8iterator* iter) {
    int step = _next_offset(iter);

    if (step <= 0) {
        return -1;
    }

    iter->byteIndex += step;
    iter->logicalIndex += 1;
    return iter->logicalIndex;
}
/*
 * Copy the encoded bytes of the character at the iterator's current
 * position into `dest` as a NUL-terminated string.
 *
 * iter    - iterator positioned on the character to copy
 * bufsize - total size of `dest` in bytes
 * dest    - destination buffer; on success the whole buffer is zero-filled
 *           first, then the character's bytes are written at the start
 *
 * Returns 1 on success. Returns 0, leaving `dest` untouched, if the bytes
 * at the current position are not a valid sequence or if `dest` is too
 * small to hold the character plus its terminating NUL.
 */
int
utf8CopyEncodedCharAt(const utf8iterator* iter, size_t bufsize, char* dest) {
    int length = _next_offset(iter);

    /*
     * Reject invalid sequences before any size arithmetic: converting the
     * -1 error value to size_t would yield a huge length and overflow dest.
     */
    if (length < 0) {
        return 0;
    }

    /* Need room for the encoded bytes and the terminator. */
    if ((size_t)length + 1 > bufsize) {
        return 0;
    }

    memset(dest, 0, bufsize);
    memcpy(dest, iter->txt + iter->byteIndex, (size_t)length);
    return 1;
}
/*
 * Rewind the iterator to the start of its string: both the byte position
 * and the logical (character) index are set back to zero. The string the
 * iterator refers to is not changed.
 */
void
utf8ResetIterator(utf8iterator* iter) {
    iter->byteIndex = 0;
    iter->logicalIndex = 0;
}
/*
 * Release an iterator. Only the iterator object itself is freed; the
 * string it points at is left alone. Passing NULL is harmless because
 * free(NULL) does nothing.
 */
void
utf8FreeIterator(utf8iterator* iter)
{
    free(iter);
}