// Program: striphtm.c
// Purpose: remove tags from .htm and .html files
// Author: Scott Wenger, 2000
//         Box 802, Stevens Point, WI 54481
//         panther@wctc.net
//         http://www.private-files.com/sware.html
//
// The program prompts for the html (or htm) file name on the console.
// After removing the tags, the results are stored in file "clean.txt".
// This source code is profusely commented.
// The malloc function is used to store the input file in memory;
// it is declared in <stdlib.h>.

#include <stdio.h>
#include <stdlib.h>

// function prototype
void striphtm(char *buffer);

/*
 * striphtm - remove html tags from a NUL-terminated string, in place.
 *
 * buffer: text to clean; it is overwritten with the tag-free text and
 *         NUL-terminated again. The result is never longer than the input.
 *
 * Everything from a '<' up to and including the next '>' is dropped.
 * A '>' that appears outside of a tag is kept as ordinary text.
 *
 * The "inside a tag" flag is static, so it keeps its value between calls:
 * if one call ends in the middle of a tag, the next call starts out
 * still inside that tag.
 */
void striphtm(char *buffer)
{
    static int intag = 0;       // flag is 1 if inside html tag
    char *source = buffer;      // next byte to examine
    char *dest = buffer;        // next position to store kept text

    for (; *source != '\0'; source++) {
        if (intag) {
            if (*source == '>') {
                intag = 0;      // end of tag, turn off flag
            }
            // tag contents (and the closing '>') are ignored
        } else if (*source == '<') {
            intag = 1;          // start of tag, set flag
        } else {
            *dest++ = *source;  // keep text outside tag
        }
    }
    *dest = '\0';               // null-terminate the results
}

/*
 * main - ask for an html file name, strip its tags, write "clean.txt".
 *
 * Returns 0 on success and 1 on any failure (no file name, input file
 * cannot be opened, output file cannot be opened or written, or out of
 * memory).
 */
int main(void)
{
    FILE *fp, *ofp;
    size_t flen = 0;            // number of characters in the input file
    size_t len = 0;             // number of characters actually stored
    int ch;                     // int, not char, so EOF is distinguishable
    int failed;
    char *mem;
    char fname[500];

    printf("\nstriphtm (c) 2000 Scott Wenger");
    printf("\nremoves tags from html files.\n");
    printf("\nName of html file: ");
    fflush(stdout);

    // the field width keeps scanf from writing past the end of fname
    if (scanf("%499s", fname) != 1) {
        fprintf(stderr, "\nNo file name given.\n\n");
        return 1;
    }

    if ((fp = fopen(fname, "r")) == NULL) {
        fprintf(stderr, "\nFile not found !");
        fprintf(stderr, "\nCheck spelling and add file path, if necessary.\n\n");
        return 1;
    }

    if ((ofp = fopen("clean.txt", "w")) == NULL) {
        fprintf(stderr, "\nUnable to open output file for writing.\n\n");
        fclose(fp);
        return 1;
    }

    while (fgetc(fp) != EOF) {
        flen++;                 // primitive yet portable file length
    }
    rewind(fp);

    mem = malloc(flen + 1);     // +1 for the terminating NUL
    if (mem == NULL) {
        fprintf(stderr, "\nUnable to allocate required memory.\n\n");
        fclose(fp);
        fclose(ofp);
        return 1;
    }

    // put htm file into memory; never store more than was counted
    while (len < flen && (ch = fgetc(fp)) != EOF) {
        mem[len++] = (char)ch;
    }
    mem[len] = '\0';            // striphtm needs a NUL-terminated string
    fclose(fp);

    striphtm(mem);              // remove the tags

    // write to output file; fclose flushes, so its result matters too
    failed = (fputs(mem, ofp) == EOF);
    if (fclose(ofp) != 0) {
        failed = 1;
    }
    free(mem);                  // mem still points at the start of the block

    if (failed) {
        fprintf(stderr, "\nError writing to file clean.txt\n\n");
        return 1;
    }

    printf("Done. The results were written to file clean.txt\n\n");
    return 0;
}