-- Leo's gemini proxy
-- Connecting to git.thebackupbox.net:1965...
-- Connected
-- Sending request
-- Meta line: 20 text/gemini
repo: html_entities_decode action: blob revision: path_from: html_entities_decode.c revision_from: refs/heads/master: path_to: revision_to:
/ html_entities_decode.c
refs/heads/master:/html_entities_decode.c #include <stdio.h> #include <string.h> #include <unistd.h>//write() #include <stdlib.h>//strtol() #ifdef GPERF #include "entities_gperf.h" char *get_entity(char *name) { struct entity *e=in_word_set(name,strlen(name)); if(e) return e->value; else return 0; } #else #include "entities.h" char *get_entity(char *name) { int i; for(i=0;entities[i];i+=2) { if(!strcmp(name,entities[i])) { return entities[i+1]; } } return 0; } #endif // https://www.w3.org/MarkUp/html-spec/html-spec_3.html#SEC3.2.3 #define NAMELEN 72 void print_entity(char *name,int len) { int i; unsigned int c;//we can store one unicode point in here. int l; char *t; char b[10];//dunno name[len-1]=0;//fuck it. we'll null out the ; and we can play with this string as a C string. if(*name != '&') printf("how in the hell did this happen?!?\n"); t=get_entity(name+1); //skip the leading & if(t) name=t; if(!strncasecmp(name,"&#x",3)) { c=strtol(name+3,0,16); //we have some hex here. need to convert to decimal. } else if(!strncasecmp(name,"&#",2)) {//we have normal decimal c=strtol(name+2,0,10); } else { //I don't know wtf this is... name[len-1]=';'; write(1,name,len); return; } if(c < 0x80) { b[0]=c; l=1; } else if(c < 0x800) { b[0]= 192 + c / 64; b[1]=128 + c % 64; l=2; } else if(c < 0x10000) { b[0]= 224 + c / 4096;b[1]= 128+c/64%64;b[2]=128+c%64;l=3; } else if(c < 0x110000) { b[0]= 240+c/262144 ;b[1]=128+c/4096%64;b[2]=128+c/64%64;b[3]=128+c%64;l=4; } //we have a decimal value for the character. now to print it. write(1,&b,l); } int main(int argc,char *argv[]) { unsigned char buffer[NAMELEN+3];//+3 just because I'm too lazy to know exactly how many bytes I'll need. char in_entity_name=0; int i=0; short in; for(;(in=fgetc(stdin)) != -1;) {//this loop needs to be fixed to read larger amounts of data so it'll go faster buffer[i]=in; i++; if(in_entity_name && i < NAMELEN && ( (i > 1 && in == ';') || //if we have an empty entitity... fuck this shit. (i > 1 && (in >= 'a' && in <= 'z') || (in >= 'A' && in <= 'Z') || (in >= '0' && in <='9') || (in == '-') || (in == '.') ) || (i == 1 && (in >= 'a' && in <= 'z') || (in >= 'A' && in <= 'Z') || (in == '#')) ) ) { if(in == ';') { print_entity(buffer,i); in_entity_name=0; i=0; } } else {//if we're not in an entity name just flush shit out. if(in == '&') { in_entity_name=1; } else { write(1,buffer,i); in_entity_name=0; i=0; } } } return 0; }
-- Response ended
-- Page fetched on Sun Jun 2 12:29:23 2024