-- Leo's gemini proxy

-- Connecting to git.thebackupbox.net:1965...

-- Connected

-- Sending request

-- Meta line: 20 text/gemini

repo: html_entities_decode
action: blob
revision:
path_from: html_entities_decode.c
revision_from: refs/heads/master:
path_to:
revision_to:

git.thebackupbox.net

html_entities_decode

git://git.thebackupbox.net/html_entities_decode

blob of:

html_entities_decode

/ html_entities_decode.c

refs/heads/master:/html_entities_decode.c
 #include <stdio.h>
 #include <string.h>
 #include <unistd.h>//write()
 #include <stdlib.h>//strtol()
 #ifdef GPERF
 #include "entities_gperf.h"
 /*
  * Resolve an entity name through in_word_set() from entities_gperf.h
  * (presumably a gperf-generated perfect hash -- confirm against the header).
  *
  * name: NUL-terminated entity name, without the leading '&' or trailing ';'.
  * Returns the matching entry's value, or a null pointer when there is no match.
  */
 char *get_entity(char *name) {
   struct entity *hit;
   hit=in_word_set(name,strlen(name));
   return hit ? hit->value : 0;
 }
 #else
 #include "entities.h"
 /*
  * Resolve an entity name by scanning the entities[] table from entities.h.
  *
  * The table is walked two slots at a time: slot idx is compared against
  * name, and slot idx+1 is what gets returned on a match.  The walk ends at
  * the first null pointer found in a name slot.
  *
  * name: NUL-terminated entity name, without the leading '&' or trailing ';'.
  * Returns the paired value, or a null pointer when the name is not listed.
  */
 char *get_entity(char *name) {
   int idx=0;
   while(entities[idx]) {
     if(strcmp(name,entities[idx]) == 0) {
       return entities[idx+1];
     }
     idx+=2;
   }
   return 0;
 }
 #endif
 // https://www.w3.org/MarkUp/html-spec/html-spec_3.html#SEC3.2.3
 #define NAMELEN 72

 /*
  * Decode one entity reference and write the result to stdout (fd 1).
  *
  * name: the raw reference text, starting with '&' and ending with ';'.
  *       The trailing ';' is overwritten with NUL while the name is looked
  *       up, and is put back before the text is passed through undecoded.
  * len:  number of bytes in name, counting both the '&' and the ';'.
  *
  * The name is first offered to get_entity().  Whatever text results (the
  * looked-up value, or the reference itself when there is no match) is then
  * decoded only if it is a numeric reference: "&#" + decimal digits or
  * "&#x"/"&#X" + hex digits.  The code point is written as UTF-8.
  *
  * Anything that cannot be decoded -- unknown name, no digits after "&#",
  * a value above U+10FFFF, or a surrogate (U+D800..U+DFFF) -- is written
  * back exactly as it was received.
  */
 void print_entity(char *name,int len) {
   const char *ref;     //the text that actually gets parsed
   const char *digits;  //first character of the number inside ref
   char *mapped;
   unsigned long c=0;   //one unicode code point
   int base;
   int ok=0;
   int l;
   unsigned char b[4];  //a UTF-8 sequence is never longer than 4 bytes

   if(len < 2) {//too short to hold "&" plus ";" -- nothing to decode
     if(len > 0) write(1,name,len);
     return;
   }
   //diagnostics go to stderr so they cannot end up inside the decoded output
   if(*name != '&') fprintf(stderr,"how in the hell did this happen?!?\n");

   name[len-1]=0;//null out the ; so the name can be used as a C string
   mapped=get_entity(name+1);//skip the leading &
   //keep `name` pointing at the caller's buffer: the pass-through path below
   //must restore and emit the original bytes, never the table's string
   ref=mapped ? mapped : name;

   if(ref[0] == '&' && ref[1] == '#') {
     if(ref[2] == 'x' || ref[2] == 'X') { digits=ref+3; base=16; }
     else { digits=ref+2; base=10; }
     //insist on a real digit first: strtoul() on its own would accept a
     //sign or leading whitespace and returns 0 when nothing parses
     ok=(*digits >= '0' && *digits <= '9') ||
        (base == 16 && ((*digits >= 'a' && *digits <= 'f') ||
                        (*digits >= 'A' && *digits <= 'F')));
     if(ok) {
       c=strtoul(digits,0,base);//overflow yields ULONG_MAX, rejected just below
       if(c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) ok=0;
     }
   }

   if(!ok) {
     name[len-1]=';';
     write(1,name,len);
     return;
   }

   if(c < 0x80) {
     b[0]=c;
     l=1;
   } else if(c < 0x800) {
     b[0]=0xC0 | (c >> 6);
     b[1]=0x80 | (c & 0x3F);
     l=2;
   } else if(c < 0x10000) {
     b[0]=0xE0 | (c >> 12);
     b[1]=0x80 | ((c >> 6) & 0x3F);
     b[2]=0x80 | (c & 0x3F);
     l=3;
   } else {
     b[0]=0xF0 | (c >> 18);
     b[1]=0x80 | ((c >> 12) & 0x3F);
     b[2]=0x80 | ((c >> 6) & 0x3F);
     b[3]=0x80 | (c & 0x3F);
     l=4;
   }
   write(1,b,l);
 }

 /*
  * Tell whether byte `in` may sit at 1-based position `pos` of a pending
  * entity reference, where position 1 is the '&' itself.
  *
  * Position 2 (first character of the name) must be a letter or '#'.
  * Later positions may be a letter, a digit, '-', '.', or the closing ';'.
  */
 static int is_entity_char(int in,int pos) {
   int letter=(in >= 'a' && in <= 'z') || (in >= 'A' && in <= 'Z');
   if(pos == 2) return letter || in == '#';
   return letter || (in >= '0' && in <= '9') || in == '-' || in == '.' || in == ';';
 }

 /*
  * Filter: copy stdin to stdout, handing every "&name;" style reference to
  * print_entity() for decoding.  Bytes outside a reference are written out
  * as soon as they are read.
  *
  * A candidate reference is collected in `buffer` starting at '&'.  It is
  * abandoned (and written out untouched) as soon as a byte arrives that
  * cannot belong to it, or once it reaches NAMELEN bytes.
  */
 int main(int argc,char *argv[]) {
   unsigned char buffer[NAMELEN+3];//holds at most NAMELEN bytes; the +3 is slack
   int in_entity_name=0;
   int i=0;//number of pending bytes in buffer
   int in;//int, not short: must hold every byte value plus EOF
   (void)argc;
   (void)argv;
   while((in=fgetc(stdin)) != EOF) {//this loop needs to be fixed to read larger amounts of data so it'll go faster
     if(in == '&') {
       //an '&' always opens a fresh candidate.  flush whatever was pending
       //first, so the buffer cannot grow without bound on input like "&&&&"
       //and so "&amp&lt;" still gets its "&lt;" decoded.
       if(i > 0) write(1,buffer,i);
       buffer[0]='&';
       i=1;
       in_entity_name=1;
       continue;
     }
     buffer[i++]=in;
     if(in_entity_name && i < NAMELEN && is_entity_char(in,i)) {
       if(in == ';') {
         print_entity((char *)buffer,i);
         in_entity_name=0;
         i=0;
       }
       continue;
     }
     //not (or no longer) inside an entity name: flush what we have as-is.
     write(1,buffer,i);
     in_entity_name=0;
     i=0;
   }
   //input ended in the middle of a candidate (e.g. "&amp" with no ';'):
   //emit the pending bytes instead of dropping them.
   if(i > 0) write(1,buffer,i);
   return 0;
 }

-- Response ended

-- Page fetched on Sun Jun 2 12:29:23 2024