Use stylesheet transformations to output HTML, for example

This commit is contained in:
David Baer
2015-08-11 15:33:08 -04:00
parent cfc0ba7e9a
commit 10927bee25
19 changed files with 725 additions and 92 deletions

208
src/format.c Normal file
View File

@@ -0,0 +1,208 @@
#include <string.h>
#include "queue.h"
#include "stack.h"
#include "utf8.h"
#include "format.h"
/* Instantiate FormatElementQueue, the queue of FormatElement values that
   formatText() fills while scanning its input.  DEFINE_QUEUE is presumably
   provided by queue.h -- see there for what it generates. */
DEFINE_QUEUE(FormatElement, FormatElementQueue);
/* UTF-8 byte sequence for U+2014 EM DASH. */
#define EM_DASH_UTF8 "\xe2\x80\x94"
/* A Tokenizer is just a pointer to a utf8iterator positioned in the input. */
typedef utf8iterator* Tokenizer;
typedef enum {
TOK_TEXT,
TOK_GREEK,
TOK_UNICODE,
TOK_STAR,
TOK_REF,
/*
TOK_DASH,
TOK_OPEN_DOUBLE_QUOTE,
TOK_CLOSE_DOUBLE_QUOTE,
TOK_OPEN_SINGLE_QUOTE,
TOK_CLOSE_SINGLE_QUOTE,
*/
TOK_EOF
} TokenType;
/* One token produced by nextToken(): its kind plus, where applicable, a
   heap-allocated copy of the text it covers. */
typedef struct {
/* Which kind of token this is. */
TokenType toktype;
/* strndup() copy of the token text for TOK_TEXT, TOK_GREEK, TOK_UNICODE and
   TOK_REF; NULL for TOK_STAR and TOK_EOF. */
char* toktext;
} Token;
/* Create a tokenizer for txt by wrapping it in a new utf8 iterator.
   Release the result with freeTokenizer(). */
static Tokenizer
initializeTokenizer(const char* txt) {
  Tokenizer created = utf8NewIterator(txt);
  return created;
}
/* Release a tokenizer obtained from initializeTokenizer() by handing it to
   utf8FreeIterator(). */
static void
freeTokenizer(utf8iterator* tokenizer) {
  utf8FreeIterator(tokenizer);
}
/*
 * Character-class helpers used by the tokenizer.
 *
 * These are `static inline` rather than plain `inline`: in C99/C11 a
 * file-scope `inline` definition without `static` or `extern` is only an
 * inline definition and does not supply an external definition, so any call
 * the compiler chooses not to inline (e.g. in an unoptimized build) can fail
 * to link.  Giving the helpers internal linkage avoids that.
 */

/* Nonzero if ch lies in U+0370..U+03FF (Greek and Coptic) or
   U+1F00..U+1FFF (Greek Extended). */
static inline int
greekChar(uint32_t ch) {
  return (((0x370 <= ch) && (ch <= 0x3ff)) ||
          ((0x1f00 <= ch) && (ch <= 0x1fff)));
}

/* Nonzero if ch lies in U+2000..U+206F (General Punctuation), which
   includes dashes and typographic quotation marks. */
static inline int
extendedPunctuation(uint32_t ch) {
  return ((0x2000 <= ch) && (ch <= 0x206f));
}

/* Nonzero if ch is at most U+00FF (Basic Latin and Latin-1 Supplement) or
   is extended punctuation as defined by extendedPunctuation(). */
static inline int
latinChar(uint32_t ch) {
  return (ch <= 0xff) || extendedPunctuation(ch);
}
/*
 * Read one token at the tokenizer's current position and advance past it.
 *
 * The first code point decides the token kind, checked in this order:
 *   NUL            -> TOK_EOF; utf8Advance() is not called.
 *   '*'            -> TOK_STAR.
 *   Greek          -> TOK_GREEK; the run continues over Greek characters,
 *                     spaces, commas and periods.
 *   "^{"           -> TOK_REF; the text is whatever lies between "^{" and
 *                     the next '}' (or the end of input if none is found).
 *   Latin          -> TOK_TEXT; the run ends before '*', before "^{", at
 *                     the first non-Latin character, or at end of input.
 *   anything else  -> TOK_UNICODE; the run continues over non-Latin
 *                     characters, spaces, commas, periods and extended
 *                     punctuation.
 *
 * toktext is NULL for TOK_EOF and TOK_STAR.  For every other kind it is a
 * strndup() copy of the covered bytes, to be freed by whoever ends up
 * holding the token text.
 */
static Token
nextToken(Tokenizer tokenizer) {
  int tokenStart = tokenizer->byteIndex;
  uint32_t cp = utf8CharAt(tokenizer);
  Token tok;

  memset(&tok, 0, sizeof(tok));

  /* End of input. */
  if (cp == '\0') {
    tok.toktype = TOK_EOF;
    tok.toktext = NULL;
    return tok;
  }

  /* Emphasis marker: a lone star carries no text. */
  if (cp == '*') {
    utf8Advance(tokenizer);
    tok.toktype = TOK_STAR;
    tok.toktext = NULL;
    return tok;
  }

  /* Greek run.  cp is Greek on entry, so at least one character is taken. */
  if (greekChar(cp)) {
    do {
      utf8Advance(tokenizer);
      cp = utf8CharAt(tokenizer);
    } while ((cp != 0) &&
             (greekChar(cp) || (cp == ' ') || (cp == ',') || (cp == '.')));
    tok.toktype = TOK_GREEK;
    tok.toktext = strndup(tokenizer->txt + tokenStart,
                          tokenizer->byteIndex - tokenStart);
    return tok;
  }

  /* Reference of the form ^{id}. */
  if ((cp == '^') && (tokenizer->txt[tokenizer->byteIndex + 1] == '{')) {
    int idBegin = 0, idFinish = 0;

    utf8Advance(tokenizer); /* now at '{' */
    utf8Advance(tokenizer); /* now at the first character of the id */
    cp = utf8CharAt(tokenizer);
    idBegin = tokenizer->byteIndex;
    for (; (cp != 0) && (cp != '}'); cp = utf8CharAt(tokenizer)) {
      utf8Advance(tokenizer);
    }
    idFinish = tokenizer->byteIndex;
    /* Stopped at '}' or at end of input; only a '}' is consumed. */
    if (cp == '}') {
      utf8Advance(tokenizer);
    }
    tok.toktype = TOK_REF;
    tok.toktext = strndup(tokenizer->txt + idBegin, idFinish - idBegin);
    return tok;
  }

  /* Latin run.  cp is Latin and not '*' on entry, so at least one character
     is taken; a following "^{" ends the run so it becomes its own token. */
  if (latinChar(cp)) {
    do {
      utf8Advance(tokenizer);
      cp = utf8CharAt(tokenizer);
      if ((cp == '^') && (tokenizer->txt[tokenizer->byteIndex + 1] == '{')) {
        break;
      }
    } while ((cp != 0) && latinChar(cp) && (cp != '*'));
    tok.toktype = TOK_TEXT;
    tok.toktext = strndup(tokenizer->txt + tokenStart,
                          tokenizer->byteIndex - tokenStart);
    return tok;
  }

  /* Any other script.  cp is non-Latin on entry, so at least one character
     is taken. */
  do {
    utf8Advance(tokenizer);
    cp = utf8CharAt(tokenizer);
  } while ((cp != 0) &&
           (!latinChar(cp) || (cp == ' ') || (cp == ',') || (cp == '.') ||
            extendedPunctuation(cp)));
  tok.toktype = TOK_UNICODE;
  tok.toktext = strndup(tokenizer->txt + tokenStart,
                        tokenizer->byteIndex - tokenStart);
  return tok;
}
/* Instantiate FEQueueStack, a stack of FormatElementQueue values.
   formatText() uses it to park the enclosing queue while the contents of an
   emphasis span are collected.  DEFINE_STACK is presumably provided by
   stack.h -- see there for what it generates. */
DEFINE_STACK(FormatElementQueue, FEQueueStack);
/*
 * Tokenize txt and build an array of FormatElement values from the tokens.
 *
 * Text, Greek, Unicode and reference tokens each become one element whose
 * textContent is the token's text.  A '*' toggles emphasis: the elements
 * between a pair of stars are gathered into a nested array that is stored
 * in a single FORMAT_EM element.  An unmatched opening star is closed at
 * the end of the input.
 *
 * Parameters:
 *   txt          - text to format (handed to the UTF-8 tokenizer).
 *   dst          - receives the array of top-level elements; release it
 *                  with freeFormatElementArray().
 *   citationQPtr - optional.  When non-NULL, the id of each "^{id}"
 *                  reference that lookupCitation() does not find is added
 *                  with addCitation().
 *
 * Returns the number of top-level elements stored in *dst.  If the
 * tokenizer cannot be created, *dst is set to NULL and 0 is returned.
 */
int formatText(const char* txt, FormatElement** dst, CitationRecordQueue* citationQPtr) {
  Tokenizer tokenizer = initializeTokenizer(txt);
  Token tok;
  int listLength = 0, em = 0;

  /* nextToken() dereferences the tokenizer unconditionally, so bail out
     rather than crash if none could be created. */
  if (tokenizer == NULL) {
    *dst = NULL;
    return 0;
  }

  NEW_QUEUE(FormatElementQueue, formatElementQ);
  NEW_STACK(FEQueueStack, st);

  while ((tok = nextToken(tokenizer)).toktype != TOK_EOF) {
    if (tok.toktype == TOK_STAR) {
      if (em) { /* end emphasis */
        FormatElement* content = NULL;
        FormatElement elt = { .elementType = FORMAT_EM };
        QUEUE_TO_ARRAY(FormatElementQueue, formatElementQ, FormatElement, content);
        elt.elementContentLength = QUEUE_LENGTH(formatElementQ);
        elt.elementContent.nestedContent = content;
        /* Release the inner queue before switching back to the enclosing
           one; otherwise its storage is lost.  This mirrors the
           QUEUE_TO_ARRAY + DESTROY_QUEUE pairing used for the top-level
           queue at the end of this function.
           NOTE(review): assumes DESTROY_QUEUE leaves the array produced by
           QUEUE_TO_ARRAY intact, as that final pairing implies -- confirm
           against queue.h. */
        DESTROY_QUEUE(FormatElementQueue, formatElementQ);
        formatElementQ = STACK_HEAD(st);
        APPEND_QUEUE(FormatElementQueue, formatElementQ, elt);
        POP_STACK(FEQueueStack, st);
        em = 0;
      } else { /* begin emphasis */
        /* Park the enclosing queue and start collecting into a fresh one. */
        PUSH_STACK(FEQueueStack, st, formatElementQ);
        REINIT_QUEUE(formatElementQ);
        em = 1;
      }
    } else {
      FormatElementType t;
      FormatElement elt = { .elementContent = { .textContent = tok.toktext } } ;

      switch (tok.toktype) {
      case TOK_GREEK:
        t = FORMAT_GREEK;
        break;
      case TOK_UNICODE:
        t = FORMAT_UNICODE;
        break;
      case TOK_REF:
        t = FORMAT_CITATION;
        /* Record each distinct citation id once, if the caller asked. */
        if (citationQPtr && !lookupCitation(*citationQPtr, tok.toktext)) {
          addCitation(citationQPtr, tok.toktext);
        }
        break;
      case TOK_TEXT:
      default:
        /* The default keeps t defined even if a new token kind is added
           without updating this switch. */
        t = FORMAT_TEXT;
        break;
      }
      elt.elementType = t;
      APPEND_QUEUE(FormatElementQueue, formatElementQ, elt);
    }
  }

  if (em) {
    /* unmatched star -- close the emphasis here */
    FormatElement* content = NULL;
    FormatElement elt = { .elementType = FORMAT_EM };
    QUEUE_TO_ARRAY(FormatElementQueue, formatElementQ, FormatElement, content);
    elt.elementContentLength = QUEUE_LENGTH(formatElementQ);
    elt.elementContent.nestedContent = content;
    /* Same as the in-loop close: release the inner queue before it is
       replaced by the enclosing one. */
    DESTROY_QUEUE(FormatElementQueue, formatElementQ);
    formatElementQ = STACK_HEAD(st);
    APPEND_QUEUE(FormatElementQueue, formatElementQ, elt);
    POP_STACK(FEQueueStack, st);
  }

  QUEUE_TO_ARRAY(FormatElementQueue, formatElementQ, FormatElement, *dst);
  listLength = QUEUE_LENGTH(formatElementQ);
  DESTROY_QUEUE(FormatElementQueue, formatElementQ);
  freeTokenizer(tokenizer);
  return listLength;
}
/*
 * Free an element array of the given length together with everything its
 * elements own.
 *
 * A FORMAT_EM element owns a nested element array, which is freed
 * recursively; every other element owns its textContent string.  The array
 * `a` itself is freed last.
 */
void freeFormatElementArray(FormatElement* a, int length) {
  int idx;

  for (idx = 0; idx < length; ++idx) {
    if (a[idx].elementType != FORMAT_EM) {
      /* Leaf element: only the text copy needs releasing. */
      free(a[idx].elementContent.textContent);
      continue;
    }
    /* Emphasis element: descend into its nested array. */
    freeFormatElementArray(a[idx].elementContent.nestedContent,
                           a[idx].elementContentLength);
  }
  free(a);
}
#ifdef FORMATTER_TEST
#include <stdio.h>
/* Sample input mixing Latin text, Chinese characters, an emphasis span
   (*...*), a Greek passage and a trailing ^{cite} reference. */
const char* str = "My name in Chinese is \xe7\x86\x8a\xe5\xa4\xa7\xe8\xa1\x9b, or *xiao da wei*. My favorite Greek passage is \xe1\xbc\x90\xce\xbd \xe1\xbc\x80\xcf\x81\xcf\x87\xe1\xbf\x87 \xe1\xbc\xa6\xce\xbd \xe1\xbd\x81 \xce\xbb\xe1\xbd\xb9\xce\xb3\xce\xbf\xcf\x82.^{cite}";
/* Smoke test: format the sample string, report how many top-level elements
   were produced, and release them again. */
int
main(void) {
  FormatElement* lst = NULL;
  /* formatText() takes three arguments; it accepts NULL for the citation
     queue, which this test does not need. */
  int l = formatText(str, &lst, NULL);
  printf("%d top-level format elements\n", l);
  freeFormatElementArray(lst, l);
  return 0;
}
#endif