diff --git a/src/format.c b/src/format.c
index 14306d9..33a8208 100644
--- a/src/format.c
+++ b/src/format.c
@@ -44,6 +44,7 @@ typedef enum {
     TOK_UNICODE,
     TOK_STAR,
     TOK_REF,
+    TOK_URL,
     /*
     TOK_DASH,
     TOK_OPEN_DOUBLE_QUOTE,
@@ -84,6 +85,37 @@ latinChar(uint32_t ch) {
     return (ch <= 0xff) || extendedPunctuation(ch);
 }
 
+/*
+ * Returns nonzero when the text at the tokenizer's current byte index starts
+ * with "http://" or "https://"; the scheme letters match case-insensitively.
+ *
+ * Declared static because a plain C99 "inline" definition provides no
+ * external definition, so a call the compiler chooses not to inline would
+ * fail to link.
+ */
+static inline int
+httpAt(Tokenizer tokenizer) {
+    static const char scheme[] = "http";
+    int i = tokenizer->byteIndex;
+    int k;
+
+    /* Compare one byte at a time: the first mismatch (the terminating NUL
+     * included) ends the scan, so nothing past the end of the text is read. */
+    for (k = 0; scheme[k] != '\0'; k++, i++) {
+        /* Cast first: <ctype.h> functions are undefined for negative values. */
+        if (tolower((unsigned char)tokenizer->txt[i]) != scheme[k]) {
+            return 0;
+        }
+    }
+    /* Optional 's' turns "http" into "https". */
+    if (tolower((unsigned char)tokenizer->txt[i]) == 's') {
+        i++;
+    }
+    return (tokenizer->txt[i] == ':') &&
+           (tokenizer->txt[i + 1] == '/') &&
+           (tokenizer->txt[i + 2] == '/');
+}
+
 static Token
 nextToken(Tokenizer tokenizer) {
     int startIndex = tokenizer->byteIndex;
@@ -125,12 +157,30 @@ nextToken(Tokenizer tokenizer) {
         result.toktype = TOK_REF;
         result.toktext = strndup(tokenizer->txt + idStart, idEnd - idStart);
         return result;
+    } else if (httpAt(tokenizer)) {
+        int endIndex = 0;
+        while ((ch != 0) && (ch != ' ') && (ch != '\r') && (ch != '\n')) {
+            utf8Advance(tokenizer);
+            ch = utf8CharAt(tokenizer);
+        }
+        if (tokenizer->txt[tokenizer->byteIndex - 1] == '.') {
+            /* heuristic: url doesn't end in . */
+            endIndex = --tokenizer->byteIndex;
+        } else {
+            endIndex = tokenizer->byteIndex;
+        }
+
+        result.toktype = TOK_URL;
+        result.toktext = strndup(tokenizer->txt + startIndex, endIndex - startIndex);
+        return result;
     } else if (latinChar(ch)) {
         while ((ch != 0) && latinChar(ch) && (ch != '*')) {
             utf8Advance(tokenizer);
             ch = utf8CharAt(tokenizer);
             if (ch == '^') {
                 if (tokenizer->txt[tokenizer->byteIndex + 1] == '{') break;
+            } else if (httpAt(tokenizer)) {
+                break;
             }
         }
         result.toktype = TOK_TEXT;
@@ -181,6 +231,8 @@ int formatText(const char* txt, FormatElement** dst, CitationRecordQueue* citati
             t = FORMAT_GREEK;
         } else if (tok.toktype == TOK_UNICODE) {
             t = FORMAT_UNICODE;
+        } else if (tok.toktype == TOK_URL) {
+            t = FORMAT_URL;
         } else if (tok.toktype == TOK_REF) {
             t = FORMAT_CITATION;
             if (citationQPtr && !lookupCitation(*citationQPtr, tok.toktext)) {
diff --git a/src/format.h b/src/format.h
index c57b8bc..ac48e96 100644
--- a/src/format.h
+++ b/src/format.h
@@ -37,7 +37,8 @@ typedef enum {
     FORMAT_STRONG,
     FORMAT_CITATION,
     FORMAT_GREEK,
-    FORMAT_UNICODE
+    FORMAT_UNICODE,
+    FORMAT_URL
 } FormatElementType;
 
 typedef struct FormatElement FormatElement;
diff --git a/src/xml.c b/src/xml.c
index 9455fd0..6184052 100644
--- a/src/xml.c
+++ b/src/xml.c
@@ -93,6 +93,11 @@ formatElementsToXML(
             xmlNodePtr unicode = xmlNewNode(sermon_ns, "unicode");
             xmlAddChild(unicode, xmlNewText(a[i].elementContent.textContent));
             xmlAddChild(parentElement, unicode);
+        } else if (a[i].elementType == FORMAT_URL) {
+            xmlNodePtr link = xmlNewNode(sermon_ns, "link");
+            xmlSetProp(link, "href", a[i].elementContent.textContent);
+            xmlAddChild(link, xmlNewText(a[i].elementContent.textContent));
+            xmlAddChild(parentElement, link);
         } else if (a[i].elementType == FORMAT_CITATION) {
             xmlNodePtr cite = xmlNewNode(sermon_ns, "cite");
             int num = findReferenceNumber(numReferences, sermonReferencesPtr, a[i].elementContent.textContent);