/*
 * format.c
 * Copyright © 2015 David A. Baer
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *  1. Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *  2. Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *  3. Neither the name of the organization nor the names of its contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY David A. Baer ''AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL David A. Baer BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include "queue.h"
#include "stack.h"
#include "utf8.h"
#include "format.h"

DEFINE_QUEUE(FormatElement, FormatElementQueue);
|
|
|
|
#define EM_DASH_UTF8 "\xe2\x80\x94"
|
|
|
|
typedef utf8iterator* Tokenizer;
|
|
/* Kinds of lexical token that nextToken() can return. */
typedef enum {
    TOK_TEXT,     /* run of Latin-range text */
    TOK_GREEK,    /* run starting with a Greek character */
    TOK_UNICODE,  /* run starting with any other non-Latin character */
    TOK_STAR,     /* a single '*' */
    TOK_REF,      /* a "^{id}" reference */
    /*
     * Disabled typographic token kinds, kept here for reference only:
     *   TOK_DASH,
     *   TOK_OPEN_DOUBLE_QUOTE, TOK_CLOSE_DOUBLE_QUOTE,
     *   TOK_OPEN_SINGLE_QUOTE, TOK_CLOSE_SINGLE_QUOTE
     */
    TOK_EOF       /* end of the input string */
} TokenType;
typedef struct {
|
|
TokenType toktype;
|
|
char* toktext;
|
|
} Token;
|
|
|
|
static Tokenizer
|
|
initializeTokenizer(const char* txt) {
|
|
return utf8NewIterator(txt);
|
|
}
|
|
|
|
static void
|
|
freeTokenizer(utf8iterator* iter) {
|
|
utf8FreeIterator(iter);
|
|
}
|
|
|
|
/*
 * Code-point classifiers used by the tokenizer.
 *
 * They are `static inline` rather than plain `inline`: in C99/C11 a plain
 * `inline` definition with external linkage does not provide an external
 * definition, so any call the compiler chooses not to inline (for example
 * in an unoptimized build) fails at link time with an undefined reference.
 * Internal linkage makes each definition self-sufficient in this file.
 */

/*
 * Return 1 if ch lies in U+0370..U+03FF (Greek and Coptic) or
 * U+1F00..U+1FFF (Greek Extended), otherwise 0.
 */
static inline int
greekChar(uint32_t ch) {
    return (((0x370 <= ch) && (ch <= 0x3ff)) ||
            ((0x1f00 <= ch) && (ch <= 0x1fff)));
}

/*
 * Return 1 if ch lies in U+2000..U+206F (General Punctuation: dashes,
 * curly quotes, and similar), otherwise 0.
 */
static inline int
extendedPunctuation(uint32_t ch) {
    return ((0x2000 <= ch) && (ch <= 0x206f));
}

/*
 * Return 1 if ch is treated as "Latin" text: any code point up to U+00FF,
 * or anything accepted by extendedPunctuation(); otherwise 0.
 */
static inline int
latinChar(uint32_t ch) {
    return (ch <= 0xff) || extendedPunctuation(ch);
}

static Token
|
|
nextToken(Tokenizer tokenizer) {
|
|
int startIndex = tokenizer->byteIndex;
|
|
uint32_t ch = utf8CharAt(tokenizer);
|
|
Token result;
|
|
memset(&result, 0, sizeof(result));
|
|
if (ch == '\0') {
|
|
result.toktype = TOK_EOF;
|
|
result.toktext = NULL;
|
|
return result;
|
|
} else if (ch == '*') {
|
|
utf8Advance(tokenizer);
|
|
result.toktype = TOK_STAR;
|
|
result.toktext = NULL;
|
|
return result;
|
|
} else if (greekChar(ch)) {
|
|
while ((ch != 0) &&
|
|
(greekChar(ch) || (ch == ' ') || (ch == ',') || (ch == '.'))) {
|
|
utf8Advance(tokenizer);
|
|
ch = utf8CharAt(tokenizer);
|
|
}
|
|
result.toktype = TOK_GREEK;
|
|
result.toktext = strndup(tokenizer->txt + startIndex, tokenizer->byteIndex - startIndex);
|
|
return result;
|
|
} else if ((ch == '^') && (tokenizer->txt[tokenizer->byteIndex + 1] == '{')) {
|
|
int idStart = 0, idEnd = 0;
|
|
utf8Advance(tokenizer); /* to { */
|
|
utf8Advance(tokenizer); /* to id */
|
|
ch = utf8CharAt(tokenizer);
|
|
idStart = tokenizer->byteIndex;
|
|
while ((ch != 0) && (ch != '}')) {
|
|
utf8Advance(tokenizer);
|
|
ch = utf8CharAt(tokenizer);
|
|
}
|
|
idEnd = tokenizer->byteIndex;
|
|
/* reached end-of-string or } */
|
|
if (ch == '}')
|
|
utf8Advance(tokenizer);
|
|
result.toktype = TOK_REF;
|
|
result.toktext = strndup(tokenizer->txt + idStart, idEnd - idStart);
|
|
return result;
|
|
} else if (latinChar(ch)) {
|
|
while ((ch != 0) && latinChar(ch) && (ch != '*')) {
|
|
utf8Advance(tokenizer);
|
|
ch = utf8CharAt(tokenizer);
|
|
if (ch == '^') {
|
|
if (tokenizer->txt[tokenizer->byteIndex + 1] == '{') break;
|
|
}
|
|
}
|
|
result.toktype = TOK_TEXT;
|
|
result.toktext = strndup(tokenizer->txt + startIndex, tokenizer->byteIndex - startIndex);
|
|
return result;
|
|
} else {
|
|
while ((ch != 0) && (!latinChar(ch) || (ch == ' ') || (ch == ',') || (ch == '.') || extendedPunctuation(ch))) {
|
|
utf8Advance(tokenizer);
|
|
ch = utf8CharAt(tokenizer);
|
|
}
|
|
result.toktype = TOK_UNICODE;
|
|
result.toktext = strndup(tokenizer->txt + startIndex, tokenizer->byteIndex - startIndex);
|
|
return result;
|
|
}
|
|
}
|
|
|
|
/*
 * Stack of FormatElementQueue values, named FEQueueStack.  formatText()
 * uses it to set the enclosing queue aside while an emphasis span is being
 * collected.  (DEFINE_STACK is presumably supplied by stack.h -- confirm
 * there for the generated API.)
 */
DEFINE_STACK(FormatElementQueue, FEQueueStack);

/*
 * Parse the inline markup in txt into an array of FormatElement values.
 *
 * Tokens from nextToken() are mapped as follows:
 *   - TOK_TEXT / TOK_GREEK / TOK_UNICODE become FORMAT_TEXT / FORMAT_GREEK /
 *     FORMAT_UNICODE elements whose textContent is the token text;
 *   - TOK_REF becomes a FORMAT_CITATION element whose textContent is the
 *     reference id; if citationQPtr is non-NULL and lookupCitation() does
 *     not find the id, it is registered with addCitation();
 *   - TOK_STAR toggles emphasis.  On the opening star the current queue is
 *     pushed on a stack and a fresh queue collects the emphasized elements;
 *     on the closing star those elements become the nestedContent of one
 *     FORMAT_EM element appended to the restored outer queue.  A star left
 *     open at the end of the input is closed implicitly.
 *
 * Parameters:
 *   txt          - NUL-terminated input text.
 *   dst          - receives the resulting element array.
 *   citationQPtr - citation queue to update, or NULL to skip registration.
 *
 * Returns the number of top-level elements stored in *dst.  The array and
 * the token texts in it are released by freeFormatElementArray().
 */
int formatText(const char* txt, FormatElement** dst, CitationRecordQueue* citationQPtr) {
    Tokenizer tokenizer = initializeTokenizer(txt);
    Token tok;
    int listLength = 0, em = 0;
    NEW_QUEUE(FormatElementQueue, formatElementQ);
    NEW_STACK(FEQueueStack, st);

    while ((tok = nextToken(tokenizer)).toktype != TOK_EOF) {
        if (tok.toktype == TOK_STAR) {
            if (em) { /* end emphasis */
                FormatElement* content = NULL;
                FormatElement elt = { .elementType = FORMAT_EM };
                QUEUE_TO_ARRAY(FormatElementQueue, formatElementQ, FormatElement, content);
                elt.elementContentLength = QUEUE_LENGTH(formatElementQ);
                elt.elementContent.nestedContent = content;
                /*
                 * NOTE(review): the emphasis queue is never passed to
                 * DESTROY_QUEUE before being overwritten below, unlike the
                 * top-level queue at the end of this function.  If
                 * QUEUE_TO_ARRAY copies rather than consumes the queue's
                 * storage this leaks it -- confirm against queue.h.
                 */
                formatElementQ = STACK_HEAD(st);
                APPEND_QUEUE(FormatElementQueue, formatElementQ, elt);
                POP_STACK(FEQueueStack, st);
                em = 0;
            } else { /* begin emphasis */
                PUSH_STACK(FEQueueStack, st, formatElementQ);
                REINIT_QUEUE(formatElementQ);
                em = 1;
            }
        } else {
            FormatElement elt = { .elementContent = { .textContent = tok.toktext } };

            /*
             * Every path assigns elementType.  Token kinds not listed here
             * fall back to FORMAT_TEXT instead of leaving the type
             * uninitialized, which would be undefined behavior when read.
             */
            switch (tok.toktype) {
            case TOK_GREEK:
                elt.elementType = FORMAT_GREEK;
                break;
            case TOK_UNICODE:
                elt.elementType = FORMAT_UNICODE;
                break;
            case TOK_REF:
                elt.elementType = FORMAT_CITATION;
                /*
                 * NOTE(review): the same id pointer is stored in the element
                 * and handed to addCitation(); this assumes addCitation()
                 * keeps its own copy -- confirm, since
                 * freeFormatElementArray() frees the element's pointer.
                 */
                if (citationQPtr && !lookupCitation(*citationQPtr, tok.toktext)) {
                    addCitation(citationQPtr, tok.toktext);
                }
                break;
            case TOK_TEXT:
            default:
                elt.elementType = FORMAT_TEXT;
                break;
            }
            APPEND_QUEUE(FormatElementQueue, formatElementQ, elt);
        }
    }

    if (em) {
        /* unmatched star -- close the emphasis here */
        FormatElement* content = NULL;
        FormatElement elt = { .elementType = FORMAT_EM };
        QUEUE_TO_ARRAY(FormatElementQueue, formatElementQ, FormatElement, content);
        elt.elementContentLength = QUEUE_LENGTH(formatElementQ);
        elt.elementContent.nestedContent = content;
        formatElementQ = STACK_HEAD(st);
        APPEND_QUEUE(FormatElementQueue, formatElementQ, elt);
        POP_STACK(FEQueueStack, st);
    }

    QUEUE_TO_ARRAY(FormatElementQueue, formatElementQ, FormatElement, *dst);
    listLength = QUEUE_LENGTH(formatElementQ);
    DESTROY_QUEUE(FormatElementQueue, formatElementQ);
    freeTokenizer(tokenizer);
    return listLength;
}

/*
 * Release an element array of the kind built by formatText().
 *
 * For each of the `length` elements: a FORMAT_EM element has its nested
 * array freed recursively (using its elementContentLength); every other
 * element has its textContent freed.  The array `a` itself is freed last.
 */
void freeFormatElementArray(FormatElement* a, int length) {
    for (int idx = 0; idx < length; ++idx) {
        FormatElement* elt = &a[idx];

        if (elt->elementType != FORMAT_EM) {
            /* Leaf element: only its text is owned. */
            free(elt->elementContent.textContent);
            continue;
        }

        /* Emphasis element: owns a nested array of elements. */
        freeFormatElementArray(elt->elementContent.nestedContent,
                               elt->elementContentLength);
    }

    free(a);
}