First of all, let me say that source code should have comments. I decorate my code with lots of useful comments, and I'm a big fan of auto-generating documentation directly from comments in source code. I believe that the closer documentation is to the source code (e.g. in the actual source code), the more helpful and maintainable it is.
The compiler ignores comments when building a program from source. However, if the header files are copied into a program bundle, the comments remain intact, even though they are useless to the end user and just take up space. For example, I develop an Objective-C data structures framework, and the end product contains copies of my public header files. Due to the amount of documentation and comments I write, many of these files grow past the file system's 4 KB allocation block size and therefore occupy an additional block on disk. (The file system allocates the smallest multiple of 4096 bytes that will accommodate the number of bytes actually in a file.) While a few kilobytes isn't a big deal, there are about 3 dozen header files in the framework, and they add up. (In my case, the comments are nearly 75% of the total size of the header files, on average. Removing most of them saves 120 KB of the 252 KB of total disk space required, nearly a 50% reduction.)
In short, I was looking for a way to remove certain comments only for the copies of header files in the production release of my framework. That way, the aggregate file size of headers in the release version could decrease, and the comments still exist in the development code and repository. (Also, the documentation is really easy to generate locally from the source — using a free tool — and is available online as well.) Further, I decided I'd like to remove only Javadoc-style block comments (/**...*/) but leave the copyright comment at the top, which is in a normal C-style block (/*...*/). I also figured the tool should handle HeaderDoc-style comments (/*!...*/) and normal single-line comments (//...\n).
Suffice it to say, regular expressions were already looking a bit hairy, and I wanted more customization and flexibility. So I wrote my own tool for it, based on finite state machines. I wrote the original version in C (due to the easier debugging facilities), then made a port in Python (which is shorter and not compiled). The code is pretty simple and easy to follow once you know what's going on. I present it here for your use, abuse, and edification.
— Quinn Taylor
Version in Python
#! /usr/bin/python
"""Strip selected comment styles from C-family source code.

The input file is scanned one character at a time by a small finite state
machine and the result is written to standard output.  Four comment styles
can be removed independently of each other:

    //...\\n    single-line comments   (-L / --line)
    /*...*/    C-style comments       (-C / --cstyle)
    /**...*/   Javadoc comments       (-J / --javadoc)
    /*!...*/   HeaderDoc comments     (-H / --headerdoc)

Comment markers that appear inside string or character literals are left
untouched.  The code deliberately avoids version-specific syntax so that it
runs under both Python 2 and Python 3.
"""

from optparse import OptionParser
import os.path
import sys

# State constants for the scanner.
(SOURCE, STRING_LITERAL, CHAR_LITERAL, SLASH, SLASH_STAR, COMMENT_LINE,
 COMMENT_CSTYLE, COMMENT_JAVADOC, COMMENT_HEADERDOC) = range(9)


def strip_comments(text, strip_line=False, strip_cstyle=False,
                   strip_javadoc=False, strip_headerdoc=False):
    """Return ``text`` with the selected comment styles removed.

    Args:
        text: Source code to process, as a single string.
        strip_line: Remove ``//...`` comments.  The newline that ends the
            comment is always kept so line structure is preserved.
        strip_cstyle: Remove ``/*...*/`` comments.
        strip_javadoc: Remove ``/**...*/`` comments.
        strip_headerdoc: Remove ``/*!...*/`` comments.

    Returns:
        The processed source code.  Comment styles that were not selected
        are copied through unchanged.
    """
    strip_block = {
        COMMENT_CSTYLE: strip_cstyle,
        COMMENT_JAVADOC: strip_javadoc,
        COMMENT_HEADERDOC: strip_headerdoc,
    }
    out = []
    emit = out.append
    state = SOURCE
    prev = ''        # previous character seen inside a block comment
    escaped = False  # True when the previous literal character was an
                     # unescaped backslash

    for ch in text:
        if state == SLASH:
            if ch == '*':
                state = SLASH_STAR
                continue
            if ch == '/':
                if not strip_line:
                    emit("//")
                state = COMMENT_LINE
                continue
            # A lone slash (e.g. division): flush it, then handle ``ch`` as
            # ordinary source so a quote right after it still opens a
            # literal.
            emit("/")
            state = SOURCE

        if state == SOURCE:
            if ch == '/':
                state = SLASH  # hold the slash back until we know more
            else:
                if ch == '"':
                    state = STRING_LITERAL
                elif ch == '\'':
                    state = CHAR_LITERAL
                emit(ch)
        elif state == STRING_LITERAL or state == CHAR_LITERAL:
            emit(ch)
            if escaped:
                # This character is escaped and can never end the literal.
                escaped = False
            elif ch == '\\':
                escaped = True
            elif ch == ('"' if state == STRING_LITERAL else '\''):
                state = SOURCE
        elif state == SLASH_STAR:
            # The character after "/*" decides which block style this is.
            if ch == '*':
                state = COMMENT_JAVADOC
                opener = "/**"
            elif ch == '!':
                state = COMMENT_HEADERDOC
                opener = "/*!"
            else:
                state = COMMENT_CSTYLE
                opener = "/*" + ch
            if not strip_block[state]:
                emit(opener)
            # For "/**" this is '*', so "/**/" closes immediately; for a
            # C-style comment it is never '*', so "/*/" does not close.
            prev = ch
        elif state == COMMENT_LINE:
            if ch == '\n':
                # Write the newline exactly once, stripped or not.
                emit("\n")
                state = SOURCE
            elif not strip_line:
                emit(ch)
        else:
            # Inside one of the three block comment styles.
            if not strip_block[state]:
                emit(ch)
            if prev == '*' and ch == '/':
                state = SOURCE
            prev = ch

    # Flush whatever was being held back when the input ended.
    if state == SLASH:
        emit("/")
    elif state == SLASH_STAR and not strip_cstyle:
        emit("/*")

    return "".join(out)


def main():
    """Parse command-line options, then strip the input file to stdout.

    Exits with status 1 if the options are invalid or the input file cannot
    be read.
    """
    parser = OptionParser()
    parser.add_option("-L", "--line", dest="stripLine",
                      action="store_true", default=False,
                      help="strip single-line comments //...\\n")
    parser.add_option("-C", "--cstyle", dest="stripCStyle",
                      action="store_true", default=False,
                      help="strip C-style comments /*...*/")
    parser.add_option("-J", "--javadoc", dest="stripJavadoc",
                      action="store_true", default=False,
                      help="strip Javadoc comments /**...*/")
    parser.add_option("-H", "--headerdoc", dest="stripHeaderDoc",
                      action="store_true", default=False,
                      help="strip HeaderDoc comments /*!...*/")
    parser.add_option("--input", dest="inputFile", default="",
                      help="file from which to read input")
    (options, args) = parser.parse_args()

    # Report every problem before exiting, not just the first one.
    error = False
    if args:
        print("ERROR: Invalid non-option arguments:")
        for arg in args:
            print(" "+arg)
        error = True
    if not (options.stripLine or options.stripCStyle or
            options.stripJavadoc or options.stripHeaderDoc):
        print("ERROR: Please specify at least one comment style to strip.")
        error = True
    if options.inputFile == "":
        print("ERROR: Must specify input file to process using '--input'.")
        error = True
    elif not os.path.exists(options.inputFile):
        print("ERROR: Specified input file does not exist!")
        error = True
    if error:
        sys.exit(1)  # non-zero so calling build scripts can detect failure

    try:
        with open(options.inputFile, "r") as source:
            text = source.read()
    except (IOError, OSError) as exc:
        sys.stderr.write("ERROR: Could not read input file: %s\n" % exc)
        sys.exit(1)

    sys.stdout.write(strip_comments(text,
                                    strip_line=options.stripLine,
                                    strip_cstyle=options.stripCStyle,
                                    strip_javadoc=options.stripJavadoc,
                                    strip_headerdoc=options.stripHeaderDoc))


if __name__ == "__main__":
    main()
Version in C
#include <getopt.h>
#include <stdio.h>

/* States of the character-at-a-time scanner used to locate comments. */
typedef enum {
	SOURCE,
	STRING_LITERAL,
	CHAR_LITERAL,
	SLASH,
	SLASH_STAR,
	COMMENT_LINE,
	COMMENT_CSTYLE,
	COMMENT_JAVADOC,
	COMMENT_HEADERDOC
} State;

#define YES 1
#define NO  0

/*
 * StripComments: copy the file named by --input to standard output,
 * removing whichever comment styles were selected on the command line
 * (-L line, -C C-style, -J Javadoc, -H HeaderDoc).  Comment markers inside
 * string and character literals are left untouched.
 *
 * Returns 0 on success, or -1 after printing usage information when the
 * options are invalid or the input file cannot be opened.
 */
int main (int argc, char *argv[]) {
	char *inputFile = 0;
	int stripLine = 0, stripCStyle = 0, stripJavadoc = 0, stripHeaderDoc = 0;
	unsigned errors = 0;

	#pragma mark Process command-line options

	static struct option long_options[] = {
		{"line",      no_argument,       0, 'L'},
		{"cstyle",    no_argument,       0, 'C'},
		{"javadoc",   no_argument,       0, 'J'},
		{"headerdoc", no_argument,       0, 'H'},
		{"input",     required_argument, 0, 'i'},
		{"help",      no_argument,       0, 'h'},
		{0, 0, 0, 0}
	};
	int option_index = 0; // getopt_long() stores the option index here
	// Must be an int: getopt_long() returns -1 at the end of the options,
	// which cannot be detected reliably where plain char is unsigned.
	int option;
	while ((option = getopt_long(argc, argv, "LCJHi:h",
	                             long_options, &option_index)) != -1) {
		if (option == 'L')
			stripLine = YES;
		else if (option == 'C')
			stripCStyle = YES;
		else if (option == 'J')
			stripJavadoc = YES;
		else if (option == 'H')
			stripHeaderDoc = YES;
		else if (option == 'i')
			inputFile = optarg;
		else if (option == 'h')
			errors++; // Will cause help to print, then exit before processing
	}

	#pragma mark Handle any options errors

	if (stripLine + stripCStyle + stripJavadoc + stripHeaderDoc == 0) {
		printf(" ERROR: Must specify at least one comment style.\n");
		printf(" (Options include -L, -C, -J, and -H.)\n");
		errors++;
	}
	if (inputFile == NULL) {
		printf(" ERROR: Must specify input file to process.\n");
		errors++;
	}
	if (optind < argc) {
		printf(" ERROR: Invalid non-option arguments:");
		while (optind < argc)
			printf(" `%s'", argv[optind++]);
		printf("\n");
		errors++;
	}
	if (errors > 0) {
		printf("\nusage: StripComments [options] --input file\n\n");
		printf(" Utility for stripping comments from source code. An input\n");
		printf(" file must be specified. If an output file is not specified,\n");
		printf(" output is printed to standard output.\n\n");
		printf("Valid options:\n");
		printf(" -L [--line] : Strip single-line comments //...\\n\n");
		printf(" -C [--cstyle] : Strip C-style comments /*...*/\n");
		printf(" -J [--javadoc] : Strip Javadoc comments /**...*/\n");
		printf(" -H [--headerdoc] : Strip HeaderDoc comments /*!...*/\n");
		printf(" -i [--input] : File from which to read input\n");
		printf("\n");
		return -1;
	}

	#pragma mark Strip comments from input

	FILE *file = fopen(inputFile, "r");
	if (file == NULL) {
		fprintf(stderr, " ERROR: Could not open input file `%s'.\n", inputFile);
		return -1;
	}

	// Both are ints so that EOF stays distinguishable from every byte value.
	int prevChar = 0, thisChar = 0;
	int escaped = NO; // previous literal character was an unescaped backslash
	State currentState = SOURCE;

	while ((thisChar = fgetc(file)) != EOF) {
		switch (currentState) {
			case SOURCE:
				if (thisChar == '/')
					currentState = SLASH; // hold the slash until we know more
				else {
					if (thisChar == '"')
						currentState = STRING_LITERAL;
					else if (thisChar == '\'')
						currentState = CHAR_LITERAL;
					putchar(thisChar);
				}
				break;

			case STRING_LITERAL:
			case CHAR_LITERAL:
				putchar(thisChar);
				if (escaped)
					escaped = NO; // an escaped character never ends the literal
				else if (thisChar == '\\')
					escaped = YES;
				else if (thisChar == (currentState == STRING_LITERAL ? '"' : '\''))
					currentState = SOURCE;
				break;

			case SLASH:
				if (thisChar == '*') {
					currentState = SLASH_STAR;
				}
				else if (thisChar == '/') {
					if (!stripLine)
						printf("//");
					currentState = COMMENT_LINE;
				}
				else {
					// A lone slash (e.g. division): flush it, then treat this
					// character as ordinary source so that a quote right
					// after the slash still opens a literal.
					putchar('/');
					if (thisChar == '"')
						currentState = STRING_LITERAL;
					else if (thisChar == '\'')
						currentState = CHAR_LITERAL;
					else
						currentState = SOURCE;
					putchar(thisChar);
				}
				break;

			case SLASH_STAR:
				// The character after "/*" decides the block comment style.
				if (thisChar == '*') {
					if (!stripJavadoc)
						printf("/**");
					currentState = COMMENT_JAVADOC;
				}
				else if (thisChar == '!') {
					if (!stripHeaderDoc)
						printf("/*!");
					currentState = COMMENT_HEADERDOC;
				}
				else {
					// thisChar is not '*' here, so "/*/" can never be taken
					// for a complete block comment.
					if (!stripCStyle) {
						printf("/*");
						putchar(thisChar);
					}
					currentState = COMMENT_CSTYLE;
				}
				break;

			case COMMENT_LINE:
				if (thisChar == '\n') {
					// Write the newline exactly once, stripped or not.
					putchar('\n');
					currentState = SOURCE;
				}
				else if (!stripLine)
					putchar(thisChar);
				break;

			case COMMENT_CSTYLE:
				if (!stripCStyle)
					putchar(thisChar);
				if (prevChar == '*' && thisChar == '/')
					currentState = SOURCE;
				break;

			case COMMENT_JAVADOC:
				if (!stripJavadoc)
					putchar(thisChar);
				if (prevChar == '*' && thisChar == '/')
					currentState = SOURCE;
				break;

			case COMMENT_HEADERDOC:
				if (!stripHeaderDoc)
					putchar(thisChar);
				if (prevChar == '*' && thisChar == '/')
					currentState = SOURCE;
				break;
		}
		prevChar = thisChar;
	}

	// Flush whatever was being held back when the input ended.
	if (currentState == SLASH)
		putchar('/');
	else if (currentState == SLASH_STAR && !stripCStyle)
		printf("/*");

	// thisChar is always EOF here, so test the last character actually read.
	if (prevChar != '\n')
		putchar('\n');

	fclose(file);
	return 0;
}