// Extended string

#define _GNU_SOURCE

#include "xstring.h"

#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include <bestiola/base64.h>

#include "core.h"

// One entry of the accent-stripping table: an accented letter and the plain
// character that to_search() writes in its place.
typedef struct {
	const char *ac;	// encoded bytes of the accented letter
	char c;		// replacement character; '\0' marks a bsearch() key
} accent_replacement;

// One entry of the lower-casing table used by to_lower_case(): an upper-case
// accented letter and its lower-case form.
typedef struct {
	const char *upper;	// encoded bytes of the upper-case letter
	const char *lower;	// lower-case form; NULL marks a bsearch() key
} lower_case_replacement;

// Accented lower-case letters and their unaccented replacements.
// to_search() looks entries up with bsearch() and compare_ac_replacements(),
// so the entries must stay sorted by the byte values of `ac`.
static accent_replacement ac_replacements[] = {
	{ "à", 'a' }, { "á", 'a' }, { "â", 'a' }, { "ã", 'a' }, { "ä", 'a' },
	{ "å", 'a' },
	{ "è", 'e' }, { "é", 'e' }, { "ê", 'e' }, { "ë", 'e' },
	{ "ì", 'i' }, { "í", 'i' }, { "î", 'i' }, { "ï", 'i' },
	{ "ò", 'o' }, { "ó", 'o' }, { "ô", 'o' }, { "õ", 'o' }, { "ö", 'o' },
	{ "ù", 'u' }, { "ú", 'u' }, { "û", 'u' }, { "ü", 'u' },
	{ "ý", 'y' }, { "ÿ", 'y' },
	{ "ā", 'a' }, { "ă", 'a' },
	{ "ē", 'e' },
	{ "ī", 'i' },
	{ "ō", 'o' },
	{ "ū", 'u' }, { "ů", 'u' }
};

// Upper-case accented letters and their lower-case forms.
// to_lower_case() looks entries up with bsearch() and
// compare_lc_replacements(), so the entries must stay sorted by the byte
// values of `upper`.
static lower_case_replacement lc_replacements[] = {
	{ "À", "à" }, { "Á", "á" }, { "Â", "â" }, { "Ã", "ã" }, { "Ä", "ä" },
	{ "Å", "å" },
	{ "Ç", "ç" },
	{ "È", "è" }, { "É", "é" }, { "Ê", "ê" }, { "Ë", "ë" },
	{ "Ì", "ì" }, { "Í", "í" }, { "Î", "î" }, { "Ï", "ï" },
	{ "Ñ", "ñ" },
	{ "Ò", "ò" }, { "Ó", "ó" }, { "Ô", "ô" }, { "Õ", "õ" }, { "Ö", "ö" },
	{ "Ù", "ù" }, { "Ú", "ú" }, { "Û", "û" }, { "Ü", "ü" },
	{ "Ý", "ý" },
	{ "Ā", "ā" }, { "Ă", "ă" },
	{ "Ē", "ē" },
	{ "Ī", "ī" },
	{ "Ō", "ō" },
	{ "Ū", "ū" }, { "Ů", "ů" },
	{ "Ÿ", "ÿ" }
};

// Forward declarations of helpers that are defined further down in this file.
static size_t max(size_t a, size_t b);
static const char *xml_replacement(char c);

// bsearch() comparison callback for accent_replacement entries.
//
// An entry whose `c` is '\0' is a search key whose `ac` may point into a
// longer string, so its length counts as 0 and the comparison is bounded by
// the other entry's length. Such a key compares equal to a table entry when
// it starts with that entry's `ac` bytes.
static int compare_ac_replacements(const void *p1, const void *p2) {
	const accent_replacement *lhs = p1;
	const accent_replacement *rhs = p2;
	size_t lhs_length = 0;
	size_t rhs_length = 0;

	if (lhs->c)
		lhs_length = strlen(lhs->ac);
	if (rhs->c)
		rhs_length = strlen(rhs->ac);

	return strncmp(lhs->ac, rhs->ac, max(lhs_length, rhs_length));
}

// bsearch() comparison callback for lower_case_replacement entries.
//
// An entry whose `lower` is NULL is a search key whose `upper` may point
// into a longer string, so its length counts as 0 and the comparison is
// bounded by the other entry's length. Such a key compares equal to a table
// entry when it starts with that entry's `upper` bytes.
static int compare_lc_replacements(const void *p1, const void *p2) {
	const lower_case_replacement *lhs = p1;
	const lower_case_replacement *rhs = p2;
	size_t lhs_length = 0;
	size_t rhs_length = 0;

	if (lhs->lower)
		lhs_length = strlen(lhs->upper);
	if (rhs->lower)
		rhs_length = strlen(rhs->upper);

	return strncmp(lhs->upper, rhs->upper, max(lhs_length, rhs_length));
}

// Returns the lower-case hexadecimal digit for the low four bits of `value`;
// the remaining bits are ignored.
char hexchar(char value) {
	static const char digits[] = "0123456789abcdef";
	return digits[value & 0x0f];
}

// Tells whether `c` is one of the characters moix_xstring_to_db() doubles:
// a single quote or a backslash.
static _Bool is_escape_db(char c) {
	return c == '\'' || c == '\\';
}

// Tells whether url_escape() may copy `c` unchanged: ASCII letters and
// digits plus the marks ! $ ' ( ) * + , - . _
static _Bool is_url_safe(char c) {
	static const char safe_marks[] = "!$'()*+,-._";

	if (!isascii(c))
		return 0;
	if (isalnum(c))
		return 1;
	// The terminating NUL of the list is excluded from the search, so
	// '\0' is never reported as safe.
	return memchr(safe_marks, c, sizeof safe_marks - 1) != NULL;
}

// Returns the larger of the two sizes.
static size_t max(size_t a, size_t b) {
	if (a > b)
		return a;
	return b;
}

// Decodes the base64 text `s`.
//
// s:    NUL-terminated base64 text. Scanning stops at the first '=';
//       whatever follows it is not examined.
// size: when non-NULL it receives the number of decoded bytes and the
//       result is not NUL-terminated. When NULL, one extra byte is
//       allocated and the result is NUL-terminated.
//
// Returns the decoded bytes wrapped with moix_alloc_memory(), or NULL when
//   - `s` contains a character that is neither base64 nor '=',
//   - the final group holds fewer than two characters (this includes a '='
//     that directly follows a complete group), or
//   - the bits of the final character that belong to no output byte are
//     not zero.
// A final group of two or three characters is validated and decoded the
// same way whether or not it is followed by '='.
moix_mem *moix_mem_decode_base64(const char *s, size_t *size) {
	// First pass: validate the text and count the decoded bytes.
	size_t payload = 0;
	int pending = 0;	// characters seen in the current group of four
	const char *c;
	for (c = s; *c; c++) {
		if (is_base64(*c)) {
			if (++pending == 4) {
				pending = 0;
				payload += 3;
			}
		}
		else if (*c == '=')
			break;
		else
			return NULL;
	}
	if (*c == '=' || pending) {
		if (pending < 2)
			return NULL;
		// Two characters carry 12 bits for one byte, three carry 18 bits
		// for two bytes. Shifting the last value by 2 * pending moves its
		// unused low bits to the top of a byte; they must all be zero.
		unsigned char leftover = base64_value(c[-1]) << 2 * pending;
		if (leftover)
			return NULL;
		payload += pending - 1;
	}

	// Without a size output the caller gets a C string, so reserve one
	// byte for the terminator.
	size_t sz = payload;
	if (!size)
		sz++;
	char *d = moix_alloc(sz);
	moix_mem *decoded = moix_alloc_memory(d);

	// Second pass: feed six bits per character into an accumulator and
	// emit a byte whenever eight are available. Exactly `payload` bytes
	// are written, so the trailing bits of a partial group never land
	// outside the allocation.
	unsigned int acc = 0;
	int acc_bits = 0;
	size_t written = 0;
	for (c = s; written < payload && is_base64(*c); c++) {
		unsigned char value = base64_value(*c);
		acc = ((acc << 6) | value) & 0xfff;
		acc_bits += 6;
		if (acc_bits >= 8) {
			acc_bits -= 8;
			d[written++] = (char)((acc >> acc_bits) & 0xff);
		}
	}

	if (size)
		*size = payload;
	else
		d[payload] = '\0';
	return decoded;
}

// Decodes base64 text by calling moix_mem_decode_base64() without a size
// output and returns whatever that call returns.
moix_mem *moix_xstring_decode_base64(const char *s) {
	size_t *no_size = NULL;
	return moix_mem_decode_base64(s, no_size);
}

//
// Returns the string in Normalized format: leading and trailing white space
// is dropped and every remaining run of white space becomes a single ' '.
//
// s: NUL-terminated input; it is not modified.
//
// Returns a newly allocated, NUL-terminated copy wrapped with
// moix_alloc_memory().
//

moix_mem *moix_xstring_normalize(const char *s) {
	// The <ctype.h> classifiers take an unsigned char value (or EOF).
	// A plain char holding a byte >= 0x80 can be negative, which is
	// undefined behaviour, hence the casts in this function.
	while (isspace((unsigned char)*s))
		s++;

	size_t length = strlen(s);
	while (length && isspace((unsigned char)s[length - 1]))
		length--;

	char *nc_start = moix_alloc(length + 1);
	moix_mem *norm = moix_alloc_memory(nc_start);
	char *nc = nc_start;
	for (size_t i = 0; i < length; i++) {
		_Bool space = isspace((unsigned char)s[i]) != 0;
		// Skip white space that follows an already written ' '. The
		// bounds check keeps the look-behind inside the buffer.
		if (space && nc > nc_start && nc[-1] == ' ')
			continue;
		*nc++ = space ? ' ' : s[i];
	}
	*nc++ = '\0';
	// Hand the number of bytes actually used to moix_realloc().
	moix_realloc(norm, nc - nc_start);

	return norm;
}

//
// Returns the string in Database format: the text wrapped as E'...' with
// every single quote and backslash doubled.
//
// s: NUL-terminated input; it is not modified.
//
// Returns a newly allocated, NUL-terminated string wrapped with
// moix_alloc_memory().
//

moix_mem *moix_xstring_to_db(const char *s) {
	size_t length = 0;
	size_t doubled = 0;
	for (const char *p = s; *p; p++) {
		length++;
		if (is_escape_db(*p))
			doubled++;
	}

	// Four extra bytes: the leading E and quote, the closing quote and
	// the terminating NUL.
	char *out = moix_alloc(length + doubled + 4);
	moix_mem *escaped = moix_alloc_memory(out);

	out[0] = 'E';
	out[1] = '\'';
	out += 2;
	for (const char *p = s; *p; p++) {
		const char ch = *p;
		if (is_escape_db(ch))
			*out++ = ch;
		*out++ = ch;
	}
	out[0] = '\'';
	out[1] = '\0';
	return escaped;
}

//
// Returns the string in XML format: every character for which
// xml_replacement() supplies a replacement is written as that replacement,
// all other characters are copied unchanged.
//
// s: NUL-terminated input; it is not modified.
//
// Returns a newly allocated, NUL-terminated string wrapped with
// moix_alloc_memory().
//

moix_mem *moix_xstring_to_xml(const char *s) {
	// Measure the output first; start at 1 for the terminating NUL.
	size_t needed = 1;
	for (const char *p = s; *p; p++) {
		const char *entity = xml_replacement(*p);
		if (entity)
			needed += strlen(entity);
		else
			needed += 1;
	}

	char *out = moix_alloc(needed);
	moix_mem *xml = moix_alloc_memory(out);
	for (const char *p = s; *p; p++) {
		const char *entity = xml_replacement(*p);
		if (!entity) {
			*out++ = *p;
			continue;
		}
		size_t entity_length = strlen(entity);
		memcpy(out, entity, entity_length);
		out += entity_length;
	}
	*out = '\0';
	return xml;
}

//
// Returns the string in lower case.
//
// Bytes without the high bit go through tolower(). A multi-byte character
// that starts with an `upper` entry of lc_replacements is replaced by that
// entry's `lower` form; any other multi-byte character is copied unchanged.
//
// s: NUL-terminated input; it is not modified.
//
// Returns a newly allocated, NUL-terminated string wrapped with
// moix_alloc_memory(). The buffer is sized from the input length, which
// relies on no table entry having a `lower` form longer than its `upper`
// form.
//

moix_mem *to_lower_case(const char *s) {
	char *l = moix_alloc(strlen(s) + 1);
	moix_mem *lower = moix_alloc_memory(l);
	while (*s) {
		if (!(*s & 0x80)) {
			*l++ = tolower((unsigned char)*s++);
			continue;
		}

		// A NULL `lower` marks this as a search key for
		// compare_lc_replacements().
		lower_case_replacement key;
		key.upper = s;
		key.lower = NULL;

		const lower_case_replacement *replacement = bsearch(&key,
			lc_replacements,
			sizeof(lc_replacements) / sizeof(lc_replacements[0]),
			sizeof(lc_replacements[0]), compare_lc_replacements);
		if (replacement) {
			size_t lower_length = strlen(replacement->lower);
			memcpy(l, replacement->lower, lower_length);
			l += lower_length;
			// Advance by what was matched in the input, not by what
			// was written.
			s += strlen(replacement->upper);
			continue;
		}

		// Copy exactly one multi-byte character: its first byte plus the
		// continuation bytes (10xxxxxx) that follow. Copying the whole
		// run of high-bit bytes would skip the table lookup for the
		// characters after it.
		*l++ = *s++;
		while ((*s & 0xc0) == 0x80)
			*l++ = *s++;
	}
	*l = '\0';
	return lower;
}

//
// Returns the string in Search format.
//
// The input goes through these steps:
//   1. every punctuation character becomes a space, except '"' when
//      `is_query` is set;
//   2. the result is passed through moix_xstring_normalize();
//   3. then through to_lower_case();
//   4. accented letters listed in ac_replacements are replaced by their
//      plain character, and each '"' takes the place of a space written
//      just before it and swallows one space right after it.
//
// s:        NUL-terminated input; it is not modified.
// is_query: keep '"' characters instead of treating them as punctuation.
//
// Returns a newly allocated, NUL-terminated string wrapped with
// moix_alloc_memory(). The intermediate buffers are released before
// returning.
//

moix_mem *to_search(const char *s, _Bool is_query) {
	char *np_start = moix_alloc(strlen(s) + 1);
	moix_mem *no_punct = moix_alloc_memory(np_start);
	char *np = np_start;
	while (*s) {
		char c = *s++;
		// <ctype.h> classifiers need an unsigned char value; a plain
		// char holding a byte >= 0x80 can be negative, which is
		// undefined behaviour.
		_Bool keep = !ispunct((unsigned char)c) || (is_query && c == '"');
		*np++ = keep ? c : ' ';
	}
	*np = '\0';

	moix_mem *norm = moix_xstring_normalize(np_start);
	moix_free(no_punct);
	moix_mem *lower = to_lower_case(moix_string(norm));
	moix_free(norm);

	s = moix_string(lower);
	char *sr_start = moix_alloc(strlen(s) + 1);
	moix_mem *search = moix_alloc_memory(sr_start);
	char *sr = sr_start;
	while (*s) {
		if (*s == '"') {
			// Attach the quote to its neighbours: overwrite a space
			// written just before it and skip one space after it.
			if (sr > sr_start && sr[-1] == ' ')
				sr[-1] = '"';
			else
				*sr++ = '"';
			s++;
			if (*s == ' ')
				s++;
			continue;
		}

		if (!(*s & 0x80)) {
			*sr++ = *s++;
			continue;
		}

		// A '\0' replacement character marks this as a search key for
		// compare_ac_replacements().
		accent_replacement key;
		key.ac = s;
		key.c = '\0';

		const accent_replacement *replacement = bsearch(&key,
			ac_replacements,
			sizeof(ac_replacements) / sizeof(ac_replacements[0]),
			sizeof(ac_replacements[0]), compare_ac_replacements);
		if (replacement) {
			*sr++ = replacement->c;
			s += strlen(replacement->ac);
			continue;
		}

		// Copy exactly one multi-byte character: its first byte plus the
		// continuation bytes (10xxxxxx) that follow. Copying the whole
		// run of high-bit bytes would leave the accents on the
		// characters after it.
		*sr++ = *s++;
		while ((*s & 0xc0) == 0x80)
			*sr++ = *s++;
	}
	*sr++ = '\0';
	moix_realloc(search, sr - sr_start);
	// `s` pointed into this buffer; it is no longer read from here on.
	moix_free(lower);
	return search;
}

// Percent-encodes `s`: characters accepted by is_url_safe() are copied
// unchanged, every other byte becomes '%' followed by two lower-case
// hexadecimal digits.
//
// Returns a newly allocated, NUL-terminated string wrapped with
// moix_alloc_memory().
moix_mem *url_escape(const char *s) {
	// Measure the output first; start at 1 for the terminating NUL.
	size_t needed = 1;
	for (const char *p = s; *p; p++) {
		if (is_url_safe(*p))
			needed += 1;
		else
			needed += 3;
	}

	char *out = moix_alloc(needed);
	moix_mem *escaped = moix_alloc_memory(out);
	for (const char *p = s; *p; p++) {
		if (!is_url_safe(*p)) {
			out[0] = '%';
			out[1] = hexchar(*p >> 4);
			out[2] = hexchar(*p);
			out += 3;
			continue;
		}
		*out++ = *p;
	}
	*out = '\0';
	return escaped;
}

// Returns the text that replaces `c` in XML output, or NULL when `c` is
// written as is. Replaced characters: " & ' < > and backslash.
static const char *xml_replacement(char c) {
	static const struct {
		char plain;
		const char *entity;
	} entities[] = {
		{ '"', "&quot;" },
		{ '&', "&amp;" },
		{ '\'', "&apos;" },
		{ '<', "&lt;" },
		{ '>', "&gt;" },
		{ '\\', "&#92;" }
	};

	for (size_t i = 0; i < sizeof(entities) / sizeof(entities[0]); i++) {
		if (entities[i].plain == c)
			return entities[i].entity;
	}
	return NULL;
}
