'-------------------------------------------------------------------------------
' XTAG.BAS
' Written by Gary Peek, Industrologic, Inc. www.industrologic.com
'
' See the user help text for a general description of the program.
'
' The general method is a 2 step process:
'   1. Read the input file into a string (byte) array
'      A. Retain, remove or substitute spaces for carriage returns, line feeds,
'         and tabs if desired, which will piece back together strings that were
'         separated by exceeding the line length limit.
'   2. Go through the array to build tags
'      A. Retain, simplify, or remove tags one at a time
'      B. Add carriage return/line feed sequences after ending tags and some
'         other tags, to make it readable, leaving long lines of text still long
'
' Problems and limitations:
'
'   The simple method used does not allow nested tags that have matching
'   ending tags to be removed selectively. For example, in the following line
'   the FONT SIZE tag alone cannot be removed, but _both_ FONT tags can be.
'     <FONT FACE="Arial"><FONT SIZE=4>sample text</FONT></FONT>
'
'   HTML generators often use FONT SIZE where headings would be more desirable,
'   so removing all font tags will remove headings.
'
' Possible future enhancements:
'   A third step to process nested tags.
'   An option to specify maximum line length
'
'-------------------------------------------------------------------------------

#DIM ALL
#COMPILE EXE
#OPTION VERSION4
#INCLUDE "WIN32API.INC"
#INCLUDE "COMDLG32.INC"
#RESOURCE "IDL.PBR"

' Control identifiers for the main dialog.
' File selection
%ID_SELINFILE    = 111
%ID_INFILE       = 112
%ID_SELOUTFILE   = 113
%ID_OUTFILE      = 114

' Paragraph tag options
%ID_FRAME1       = 120
%ID_RETAINP      = 121
%ID_REMOVEP      = 122
%ID_REPLACEP     = 123

' Tab options
%ID_FRAME4       = 130
%ID_RETAINT      = 131
%ID_REMOVET      = 132
%ID_REPLACET     = 133

' Line feed options
%ID_FRAME2       = 140
%ID_RETAINLF     = 141
%ID_REMOVELF     = 142
%ID_REPLACELF    = 143

' Carriage return options
%ID_FRAME3       = 150
%ID_RETAINCR     = 151
%ID_REMOVECR     = 152
%ID_REPLACECR    = 153

' Tag removal check boxes
%ID_REMOVECM     = 161
%ID_REMOVEFT     = 162
%ID_REMOVEMT     = 163
%ID_REMOVEDV     = 164
%ID_REMOVEST     = 165

' Command buttons
%ID_MODIFY       = 201
%ID_ABORT        = 202
%ID_HELP         = 203
%ID_EXIT         = 204

%ID_TXTBYTECOUNT = 251
%ID_BYTECOUNT    = 252
%ID_TXTCHANGES   = 253
%ID_CHANGES      = 254

' Status text box
%ID_STATUS       = 301

' Global data variables
GLOBAL hMainDlg      AS LONG
GLOBAL hDisplayDlg   AS LONG
GLOBAL result        AS LONG
GLOBAL infile        AS STRING          ' input file name
GLOBAL outfile       AS STRING          ' output file name
GLOBAL char()        AS STRING          ' file contents, one character per element
GLOBAL c             AS STRING          ' character currently being processed
GLOBAL removesp      AS LONG
GLOBAL rrrpa         AS LONG            ' paragraph tags: 1 = retain, 2 = remove, 3 = replace
GLOBAL rrrcr         AS LONG            ' carriage returns: 1 = retain, 2 = remove, 3 = replace
GLOBAL rrrlf         AS LONG            ' line feeds: 1 = retain, 2 = remove, 3 = replace
GLOBAL rrrtb         AS LONG            ' tabs: 1 = retain, 2 = remove, 3 = replace
GLOBAL removefont    AS LONG
GLOBAL removecomment AS LONG
GLOBAL removemeta    AS LONG
GLOBAL removestyle   AS LONG
GLOBAL removediv     AS LONG
GLOBAL numpa         AS LONG
GLOBAL numcr         AS LONG            ' carriage returns removed/replaced
GLOBAL numlf         AS LONG            ' line feeds removed/replaced
GLOBAL numtb         AS LONG            ' tabs removed/replaced
GLOBAL runflag       AS LONG            ' 1 while processing, set to 0 by Abort/Exit
GLOBAL bytesinfile   AS DOUBLE
GLOBAL bf            AS LONG            ' index of byte in file
GLOBAL ba            AS LONG            ' index of last used element in char()
GLOBAL a             AS LONG            ' index into char() while building tags
GLOBAL numtags       AS LONG
GLOBAL tag           AS STRING          ' tag being built, original case
GLOBAL t             AS STRING          ' lower case copy of tag
GLOBAL newline       AS LONG            ' nonzero when output is at the start of a line
GLOBAL tagflag       AS LONG            ' nonzero while inside "<" ... ">"
GLOBAL centerflag    AS LONG
GLOBAL fontflag      AS LONG
GLOBAL tempstr       AS ASCIIZ * 255
GLOBAL stattxt       AS STRING          ' accumulated text shown in the status box
GLOBAL helptext      AS STRING

' Callback Declarations
DECLARE CALLBACK FUNCTION MainProc()
DECLARE SUB removetags
DECLARE SUB processtag
DECLARE SUB appendstatus(BYVAL txt AS STRING)

'------------------------------------------------------------------------------
' Program entry point: builds the main dialog and runs it modally.
FUNCTION PBMAIN() AS LONG

    DIALOG FONT "MS Sans Serif", 8
    DIALOG NEW 0, "Xtag, an HTML file simplification utility, version 1.1", 0, 0, 340, 220, _
        %DS_CENTER OR %WS_CAPTION OR %WS_SYSMENU, 0 TO hMainDlg

    ' file selection
    CONTROL ADD BUTTON, hMainDlg, %ID_SELINFILE, "Select input file", 10, 10, 70, 12, %WS_GROUP
    CONTROL ADD LABEL, hMainDlg, %ID_INFILE, "Input file name", 90, 10, 240, 12, , %WS_EX_CLIENTEDGE
    CONTROL SET COLOR hMainDlg, %ID_INFILE, %BLACK, %WHITE
    CONTROL ADD BUTTON, hMainDlg, %ID_SELOUTFILE, "Select output file" , 10, 25, 70, 12, %WS_GROUP
    CONTROL ADD LABEL, hMainDlg, %ID_OUTFILE, "Output file name", 90, 25, 240, 12, , %WS_EX_CLIENTEDGE
    CONTROL SET COLOR hMainDlg, %ID_OUTFILE, %BLACK, %WHITE

    ' paragraph tag options
    CONTROL ADD FRAME, hMainDlg, %ID_FRAME1, "Paragraph Tags", 10, 45, 145, 45
    CONTROL ADD OPTION, hMainDlg, %ID_RETAINP, "Retain all tags and options", 20, 55, 125, 12, %WS_GROUP
    CONTROL ADD OPTION, hMainDlg, %ID_REMOVEP, "Remove all tags", 20, 65, 125, 12
    CONTROL ADD OPTION, hMainDlg, %ID_REPLACEP, "Replace centering with center tags", 20, 75, 125, 12

    ' Each frame gets its own identifier (the constants are grouped with the
    ' option buttons they enclose) so that no two controls share an ID.
    CONTROL ADD FRAME, hMainDlg, %ID_FRAME4, "Tabs", 10, 95, 100, 45
    CONTROL ADD OPTION, hMainDlg, %ID_RETAINT, "Retain", 20, 105, 80, 12, %WS_GROUP
    CONTROL ADD OPTION, hMainDlg, %ID_REMOVET, "Remove", 20, 115, 80, 12
    CONTROL ADD OPTION, hMainDlg, %ID_REPLACET, "Replace with spaces", 20, 125, 80, 12

    CONTROL ADD FRAME, hMainDlg, %ID_FRAME3, "Carriage Returns", 120, 95, 100, 45
    CONTROL ADD OPTION, hMainDlg, %ID_RETAINCR, "Retain", 130, 105, 80, 12, %WS_GROUP
    CONTROL ADD OPTION, hMainDlg, %ID_REMOVECR, "Remove", 130, 115, 80, 12
    CONTROL ADD OPTION, hMainDlg, %ID_REPLACECR, "Replace with spaces", 130, 125, 80, 12

    CONTROL ADD FRAME, hMainDlg, %ID_FRAME2, "Line Feeds", 230, 95, 100, 45
    CONTROL ADD OPTION, hMainDlg, %ID_RETAINLF, "Retain", 240, 105, 80, 12, %WS_GROUP
    CONTROL ADD OPTION, hMainDlg, %ID_REMOVELF, "Remove", 240, 115, 80, 12
    CONTROL ADD OPTION, hMainDlg, %ID_REPLACELF, "Replace with spaces", 240, 125, 80, 12

    ' tag removal check boxes
    CONTROL ADD CHECKBOX, hMainDlg, %ID_REMOVEMT, "Remove META tags", 165, 48, 80, 12, %WS_GROUP
    CONTROL ADD CHECKBOX, hMainDlg, %ID_REMOVECM, "Remove comments", 255, 48, 80, 12, %WS_GROUP
    CONTROL ADD CHECKBOX, hMainDlg, %ID_REMOVEST, "Remove STYLE tags", 165, 60, 80, 12, %WS_GROUP
    CONTROL ADD CHECKBOX, hMainDlg, %ID_REMOVEDV, "Remove DIV tags", 255, 60, 80, 12, %WS_GROUP
    CONTROL ADD CHECKBOX, hMainDlg, %ID_REMOVEFT, "Remove FONT tags", 165, 73, 80, 12, %WS_GROUP

    ' command buttons
    CONTROL ADD BUTTON, hMainDlg, %ID_MODIFY, "Modify", 10, 150, 60, 12, %WS_GROUP
    CONTROL ADD BUTTON, hMainDlg, %ID_ABORT, "Abort", 95, 150, 60, 12, %WS_GROUP
    CONTROL ADD BUTTON, hMainDlg, %ID_HELP, "Help", 180, 150, 60, 12, %WS_GROUP
    CONTROL ADD BUTTON, hMainDlg, %ID_EXIT, "Exit", 265, 150, 60, 12, %WS_GROUP

    ' status box
    CONTROL ADD TEXTBOX, hMainDlg, %ID_STATUS, "", 10, 170, 320, 40, _
        %WS_BORDER OR %WS_VSCROLL OR %ES_MULTILINE, %WS_EX_CLIENTEDGE

    DIALOG SHOW MODAL hMainDlg, CALL MainProc

END FUNCTION

'------------------------------------------------------------------------------
' Dialog callback for the main window.
'   %WM_INITDIALOG : sets the default output file name and default options.
'   %WM_COMMAND    : handles the file selection, Modify, Abort, Exit and Help
'                    buttons. Modify copies the dialog settings into the global
'                    option variables and then calls removetags.
CALLBACK FUNCTION MainProc()

    ' LOCAL hIcon AS LONG
    LOCAL lowname AS STRING             ' lower case copy of outfile for name checks

    SELECT CASE CBMSG

        CASE %WM_INITDIALOG
            ' uncomment the 2 lines below and the variable above if you want to add an icon
            ' hIcon = LoadIcon(GetModuleHandle(BYVAL %NULL), "PROGRAM") ' get handle to icon
            ' DIALOG SEND CBHNDL, %WM_SETICON, %ICON_BIG, hIcon 'send message to dialog to set icon
            outfile = "temp.htm"
            CONTROL SET OPTION hMainDlg, %ID_RETAINP, %ID_RETAINP, %ID_REPLACEP
            CONTROL SET OPTION hMainDlg, %ID_REPLACET, %ID_RETAINT, %ID_REPLACET
            CONTROL SET OPTION hMainDlg, %ID_REMOVECR, %ID_RETAINCR, %ID_REPLACECR
            CONTROL SET OPTION hMainDlg, %ID_REPLACELF, %ID_RETAINLF, %ID_REPLACELF
            CONTROL SET CHECK hMainDlg, %ID_REMOVEMT, 1
            CONTROL SET CHECK hMainDlg, %ID_REMOVECM, 1
            CONTROL SET CHECK hMainDlg, %ID_REMOVEST, 1
            CONTROL SET CHECK hMainDlg, %ID_REMOVEDV, 1
            CONTROL SET CHECK hMainDlg, %ID_REMOVEFT, 1

        CASE %WM_COMMAND
            SELECT CASE CBCTL

                CASE %ID_SELINFILE
                    OpenFileDialog 0, "Input File", infile, "", _
                        "All Files|*.*", "", %OFN_FILEMUSTEXIST OR %OFN_HIDEREADONLY OR %OFN_LONGNAMES
                    CONTROL SET TEXT hMainDlg, %ID_INFILE, infile

                CASE %ID_SELOUTFILE
                    SaveFileDialog 0, "Output File", outfile, "", _
                        "All Files|*.*", "", %OFN_OVERWRITEPROMPT OR %OFN_HIDEREADONLY OR %OFN_LONGNAMES
                    ' temp.tmp is the scratch file removetags writes when the
                    ' output replaces the input, so reject it as an output name
                    ' whether it is given bare or with a drive/directory prefix.
                    lowname = LCASE$(outfile)
                    IF lowname = "temp.tmp" OR RIGHT$(lowname, 9) = "\temp.tmp" OR RIGHT$(lowname, 9) = ":temp.tmp" THEN
                        outfile = "temp.htm"
                        MSGBOX "temp.tmp cannot be used as an output file name"
                    END IF
                    ' show the name that will really be used
                    CONTROL SET TEXT hMainDlg, %ID_OUTFILE, outfile

                CASE %ID_MODIFY
                    IF infile = "" THEN     ' no input chosen yet, ask for one now
                        OpenFileDialog 0, "Input File", infile, "", _
                            "All Files|*.*", "", %OFN_FILEMUSTEXIST OR %OFN_HIDEREADONLY OR %OFN_LONGNAMES
                        CONTROL SET TEXT hMainDlg, %ID_INFILE, infile
                    END IF
                    CONTROL SET TEXT hMainDlg, %ID_OUTFILE, outfile

                    ' paragraph tags: 1 = retain, 2 = remove, 3 = replace
                    CONTROL GET CHECK hMainDlg, %ID_RETAINP TO result
                    IF result = 1 THEN rrrpa = 1
                    CONTROL GET CHECK hMainDlg, %ID_REMOVEP TO result
                    IF result = 1 THEN rrrpa = 2
                    CONTROL GET CHECK hMainDlg, %ID_REPLACEP TO result
                    IF result = 1 THEN rrrpa = 3

                    ' tabs
                    CONTROL GET CHECK hMainDlg, %ID_RETAINT TO result
                    IF result = 1 THEN rrrtb = 1
                    CONTROL GET CHECK hMainDlg, %ID_REMOVET TO result
                    IF result = 1 THEN rrrtb = 2
                    CONTROL GET CHECK hMainDlg, %ID_REPLACET TO result
                    IF result = 1 THEN rrrtb = 3

                    ' carriage returns
                    CONTROL GET CHECK hMainDlg, %ID_RETAINCR TO result
                    IF result = 1 THEN rrrcr = 1
                    CONTROL GET CHECK hMainDlg, %ID_REMOVECR TO result
                    IF result = 1 THEN rrrcr = 2
                    CONTROL GET CHECK hMainDlg, %ID_REPLACECR TO result
                    IF result = 1 THEN rrrcr = 3

                    ' line feeds
                    CONTROL GET CHECK hMainDlg, %ID_RETAINLF TO result
                    IF result = 1 THEN rrrlf = 1
                    CONTROL GET CHECK hMainDlg, %ID_REMOVELF TO result
                    IF result = 1 THEN rrrlf = 2
                    CONTROL GET CHECK hMainDlg, %ID_REPLACELF TO result
                    IF result = 1 THEN rrrlf = 3

                    ' tag removal check boxes
                    CONTROL GET CHECK hMainDlg, %ID_REMOVEMT TO result
                    IF result = 1 THEN removemeta = 1 ELSE removemeta = 0
                    CONTROL GET CHECK hMainDlg, %ID_REMOVECM TO result
                    IF result = 1 THEN removecomment = 1 ELSE removecomment = 0
                    CONTROL GET CHECK hMainDlg, %ID_REMOVEST TO result
                    IF result = 1 THEN removestyle = 1 ELSE removestyle = 0
                    CONTROL GET CHECK hMainDlg, %ID_REMOVEDV TO result
                    IF result = 1 THEN removediv = 1 ELSE removediv = 0
                    CONTROL GET CHECK hMainDlg, %ID_REMOVEFT TO result
                    IF result = 1 THEN removefont = 1 ELSE removefont = 0

                    removetags              ' subroutine to remove tags

                CASE %ID_ABORT
                    runflag = 0             ' removetags polls this flag and stops

                CASE %ID_EXIT
                    runflag = 0
                    DIALOG END CBHNDL, 0    ' causes %IDCANCEL message

                CASE %ID_HELP
                    helptext = "This program simplifies HTML files created by word processing programs like" + CHR$(13) _
                        + "Word, which generate a large number of tags to attempt to make the HTML" + CHR$(13) _
                        + "represent the appearance of the document." + CHR$(13) _
                        + CHR$(13) _
                        + "With its most aggressive tag reduction features turned on, it reduces the" + CHR$(13) _
                        + "HTML to a bare minimum, retaining only the content text, basic formatting" + CHR$(13) _
                        + "like bold, italic, underline, etc., lists, centering, and headings." + CHR$(13) _
                        + CHR$(13) _
                        + "Limitations: Cannot identify and selectively remove nested tags that include" + CHR$(13) _
                        + "matching ending tags." + CHR$(13) _
                        + CHR$(13) _
                        + "Xtag courtesy of Industrologic, Inc., www.industrologic.com"
                    MSGBOX helptext, , "Xtag Help"

            END SELECT

    END SELECT

END FUNCTION

' ------------------------------------------------------------------------------
' Appends txt to the accumulated status text and shows it in the status box,
' scrolled down so that the most recent text is visible.
SUB appendstatus(BYVAL txt AS STRING)

    stattxt = stattxt + txt
    CONTROL SET TEXT hMainDlg, %ID_STATUS, stattxt
    CONTROL SEND hMainDlg, %ID_STATUS, %EM_LINESCROLL, 0, 100

END SUB

' ------------------------------------------------------------------------------
' Processes infile into outfile using the global option settings.
'   Pass 1 reads the file one byte at a time into char(), applying the
'          retain/remove/replace settings for carriage returns, line feeds
'          and tabs.
'   Pass 2 walks char(), copies ordinary text to the output file and hands
'          each complete "<...>" tag to processtag.
' When outfile equals infile the output is written to temp.tmp, which then
' replaces the input file.
' Abort/Exit set runflag to 0; both passes poll it every 100 characters and
' stop, closing any open file and releasing char() before returning.
SUB removetags

    IF infile = "" THEN                 ' file dialog was cancelled, nothing to do
        appendstatus "No input file selected." + $CRLF
        EXIT SUB
    END IF

    runflag = 1
    tagflag = 0                         ' do not inherit a half-built tag from an aborted run
    tag = ""

    ' DIALOG DOEVENTS below lets button clicks through while we are running;
    ' keep Modify from starting a second pass over files that are still open.
    CONTROL DISABLE hMainDlg, %ID_MODIFY

    ' open file, determine length, setup array
    OPEN infile FOR BINARY AS #1
    bytesinfile = LOF(1)
    REDIM char(bytesinfile + 1000)      ' REDIM so a leftover array can never be too small

    ' read individual file bytes into array
    appendstatus infile + $CRLF + "Reading file into memory... "

    numcr = 0                           ' character counts
    numlf = 0
    numtb = 0
    ba = 0

    FOR bf = 1 TO bytesinfile           ' for each byte in file
        IF bf MOD 100 = 0 THEN DIALOG DOEVENTS   ' check for exit
        IF runflag = 0 THEN EXIT FOR
        GET$ #1, 1, c                   ' get one character from file

        IF c = CHR$(13) THEN            ' carriage return
            IF rrrcr = 1 THEN           ' if retain
                ba = ba + 1             ' increment array pointer
                char(ba) = c            ' store character in array
            ELSEIF rrrcr = 2 THEN       ' if remove
                numcr = numcr + 1       ' increment remove/replace count
            ELSEIF rrrcr = 3 THEN       ' if replace
                ba = ba + 1             ' increment array pointer
                char(ba) = " "          ' replace with space in array
                numcr = numcr + 1       ' increment remove/replace count
            END IF

        ELSEIF c = CHR$(10) THEN        ' line feed
            IF rrrlf = 1 THEN
                ba = ba + 1
                char(ba) = c
            ELSEIF rrrlf = 2 THEN
                numlf = numlf + 1
            ELSEIF rrrlf = 3 THEN
                ba = ba + 1
                char(ba) = " "
                numlf = numlf + 1
            END IF

        ELSEIF c = CHR$(9) THEN         ' tab
            IF rrrtb = 1 THEN
                ba = ba + 1
                char(ba) = c
            ELSEIF rrrtb = 2 THEN
                numtb = numtb + 1
            ELSEIF rrrtb = 3 THEN
                ba = ba + 1
                char(ba) = " "
                numtb = numtb + 1
            END IF

        ELSE                            ' leave normal characters
            ba = ba + 1
            char(ba) = c
        END IF
    NEXT bf

    CLOSE #1

    IF runflag = 0 THEN                 ' aborted while reading
        ERASE char()
        appendstatus "aborted." + $CRLF
        CONTROL ENABLE hMainDlg, %ID_MODIFY
        EXIT SUB
    END IF

    ' report the file size; the loop counter is one past the last byte here
    appendstatus FORMAT$(bytesinfile) + " bytes read." + $CRLF
    ' + "CR removed/replaced : " + FORMAT$(numcr) + " LF removed/replaced : " + FORMAT$(numlf) + " Tabs removed/replaced : " + FORMAT$(numtb) + $CRLF

    IF outfile <> infile THEN           ' open output file
        OPEN outfile FOR OUTPUT AS #2
    ELSE                                ' replacing the input: write to a scratch file first
        OPEN "temp.tmp" FOR OUTPUT AS #2
    END IF

    ' process array
    appendstatus "Removing tags... "

    numtags = 0
    newline = 0

    FOR a = 1 TO ba
        c = char(a)                     ' get character from array
        IF a MOD 100 = 0 THEN DIALOG DOEVENTS    ' check for exit
        IF runflag = 0 THEN EXIT FOR

        IF tagflag = 0 THEN             ' not building a tag
            IF c = "<" THEN             ' start of tag
                tagflag = 1
                tag = c                 ' start building tag
            ELSEIF c = " " OR c = CHR$(9) THEN   ' if space or tabs
                IF newline = 0 THEN     ' if not first character of new line
                    PRINT #2, c;        ' pass through a regular character
                END IF
            ELSE
                PRINT #2, c;            ' pass through a regular character
                newline = 0
            END IF
        ELSE                            ' building a tag
            tag = tag + c               ' add character to tag
            IF c = ">" THEN             ' tag complete
                tagflag = 0
                t = LCASE$(tag)         ' make comparisons easier
                numtags = numtags + 1   ' count tags
                processtag              ' process tag just built
            END IF
        END IF
    NEXT a

    CLOSE #2
    ERASE char()

    IF runflag = 0 THEN                 ' aborted while writing
        IF outfile = infile THEN KILL "temp.tmp"     ' discard the partial scratch file
        appendstatus "aborted." + $CRLF
    ELSE
        appendstatus FORMAT$(numtags) + " tags processed." + $CRLF
        IF outfile = infile THEN        ' replace the input file with the scratch file
            KILL infile
            NAME "temp.tmp" AS infile
        END IF
    END IF

    CONTROL ENABLE hMainDlg, %ID_MODIFY

END SUB

' NOTE(review): the processtag listing that starts below is damaged in this copy
' (the HTML tag literals appear to have been stripped); it is left untouched here.
' ------------------------------------------------------------------------------ SUB processtag IF LEFT$(t, 10) = "" newline = 1 ' flag a new line starting ELSEIF LEFT$(t, 6) = " PRINT #2, LEFT$(tag, 5) + ">" newline = 1 ' flag a new line starting ' -------------------------------------------------- ' conversions to simpler tags ELSEIF LEFT$(t, 4) = "

"; ELSEIF LEFT$(t, 4) = "

"; ELSEIF LEFT$(t, 4) = "

"; ELSEIF LEFT$(t, 4) = "

"; ELSEIF LEFT$(t, 4) = "

"; ELSEIF LEFT$(t, 4) = "
"; ELSEIF LEFT$(t, 4) = "
  • "; ' -------------------------------------------------- ' paragraph options ELSEIF LEFT$(t, 3) = "

    0 THEN centerflag = 1 PRINT #2, "

    "; END IF END IF ELSEIF t = "

    " THEN IF centerflag THEN centerflag = 0 PRINT #2, "
    " newline = 1 ' flag a new line starting ELSE IF rrrpa = 1 THEN ' if retaining PRINT #2, tag newline = 1 ' flag a new line starting ELSEIF rrrpa = 2 THEN ' if removing PRINT #2, "
    " newline = 1 ' flag a new line starting ELSE PRINT #2, "
    " newline = 1 ' flag a new line starting END IF END IF ' -------------------------------------------------- ' special tags ELSEIF LEFT$(t, 4) = "