Windows command line parsing is rather wonky, since it's done by the C runtime library. You can examine the code in your Visual Studio install directory at something like:
- C:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\crt\src\stdargv.c
The actual path on your machine may vary, of course. And it won't be there if you didn't install the C runtime sources with Visual Studio.
I believe the "logic", such as it is, was inherited from DOS, so it's rather crufty.
The basic grammar goes something like this:
A Command Line is a sequence of 1 or more WORDS separated by Whitespace.
Each Word is a sequence of 1 or more of the following: BARE_WORD, QUOTED_WORD or ESCAPE_SEQUENCE. Words are terminated by whitespace or the end of the command line.
A BARE_WORD is a sequence of 1 or more characters other than backslash ('\'), double-quote ('"') or whitespace.
A QUOTED_WORD is introduced by a LEAD_IN_QUOTE ('"'), followed by zero or more of the following:
- Whitespace
- ESCAPE_SEQUENCE
- BARE_WORD
and is terminated by a LEAD_OUT_QUOTE ('"'). The lead-in and lead-out quotes are removed from the quoted word.
An ESCAPE_SEQUENCE is one of the following constructs:
- An even number of backslashes ('\'), followed by a quotation mark ('"').
This represents a series of escaped backslashes followed by a lead-in/lead-out quote. Each pair of backslashes represents a single backslash.
- An odd number of backslashes, followed by a quotation mark ('"').
This represents a series of escaped backslashes, followed by a literal quotation mark.
- A sequence of backslashes, that is not followed by a quotation mark.
This represents a series of unescaped backslashes and is passed through as-is.
That's about it.
The first word on the command line is the command name (e.g., the name/path of the executable). Strictly speaking, parsing the command name should be simpler than parsing the other words, as it must represent a valid NTFS file name. That's not necessarily true, however, depending on who composed the command line.
Here's some sample C# code that should parse any given command line in the same way that the Windows OS does, though I should note that this has not been thoroughly tested.
The method Parse() returns an IEnumerable<string>, the first element of which is the command/program name, with the remainder the words that make up the arguments.
/// <summary>
/// Splits a raw command line into its words (command name first, then the arguments),
/// applying the backslash / double-quote escaping rules documented on
/// <see cref="ParseEscapeSequence"/>.
/// </summary>
/// <remarks>
/// The scanner keeps its cursor and output buffer in instance fields, so one instance
/// should not run two <see cref="Parse"/> calls concurrently.
/// </remarks>
class CommandLineParser
{
    private char[] cmd;         // source buffer
    private StringBuilder buf;  // output buffer: the word currently being assembled
    private int i;              // current position within the source buffer

    /// <summary>
    /// Creates a parser with no source loaded; <see cref="Parse"/> (re)initialises all state.
    /// </summary>
    public CommandLineParser()
    {
        cmd = null;
        buf = null;
        i = -1;
    }

    /// <summary>
    /// Parses <paramref name="commandLine"/> into words.
    /// </summary>
    /// <param name="commandLine">The raw, unparsed command line.</param>
    /// <returns>
    /// The words in order of appearance. The first element, if any, is the command/program
    /// name and the remainder are its arguments. An empty or all-whitespace command line
    /// produces an empty sequence.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="commandLine"/> is null.</exception>
    /// <remarks>
    /// The whole line is parsed before this method returns (it is not a lazy iterator), so
    /// a null argument is reported at the call site and the returned sequence can be
    /// enumerated any number of times without touching the parser's state again.
    /// </remarks>
    public IEnumerable<string> Parse( string commandLine )
    {
        if ( commandLine == null )
        {
            throw new ArgumentNullException( "commandLine" );
        }

        cmd = commandLine.ToCharArray();
        buf = new StringBuilder();
        i = 0;

        List<string> words = new List<string>();

        // Leading whitespace is skipped rather than rejected, so command lines read from
        // a config file or the registry need not be trimmed by the caller first.
        ConsumeWhitespace();

        // Invariant: at the top of the loop cmd[i] is never whitespace. Either whitespace
        // was just consumed, or the previous fragment was followed by more of the same word.
        while ( i < cmd.Length )
        {
            char ch = cmd[i];
            if ( ch == '\\' ) { ParseEscapeSequence(); }
            else if ( ch == '"' ) { ParseQuotedWord(); }
            else { ParseBareWord(); }

            // A word may be built from several adjacent fragments (e.g. a"b c"d); it is
            // only complete once whitespace or the end of the input is reached.
            if ( i >= cmd.Length || char.IsWhiteSpace( cmd[i] ) )
            {
                words.Add( buf.ToString() );
                buf.Length = 0;
                ConsumeWhitespace();
            }
        }

        return words;
    }

    /// <summary>
    /// Parse a quoted word: everything between a lead-in and a lead-out quotation mark,
    /// whitespace included. The surrounding quotation marks are not copied to the buffer.
    /// A missing lead-out quote is tolerated; the word then runs to the end of the input.
    /// </summary>
    private void ParseQuotedWord()
    {
        // scan over the lead-in quotation mark w/o adding it to the buffer
        ++i;

        // scan the contents of the quoted word into the buffer
        while ( i < cmd.Length && cmd[i] != '"' )
        {
            char ch = cmd[i];
            if ( ch == '\\' )
            {
                ParseEscapeSequence();
            }
            else
            {
                buf.Append( ch );
                ++i;
            }
        }

        // scan over the lead-out quotation mark (if present) w/o adding it to the buffer
        if ( i < cmd.Length )
        {
            ++i;
        }
    }

    /// <summary>
    /// Parse a bareword: a run of characters up to (but not including) the next
    /// whitespace, quotation mark or backslash.
    /// </summary>
    private void ParseBareWord()
    {
        while ( i < cmd.Length )
        {
            char ch = cmd[i];
            if ( char.IsWhiteSpace( ch ) || ch == '"' || ch == '\\' )
            {
                break; // the caller decides how to handle the terminating character
            }
            buf.Append( ch );
            ++i;
        }
    }

    /// <summary>
    /// Parse an escape sequence of one or more backslashes followed by an optional
    /// trailing quotation mark.
    /// </summary>
    /// <remarks>
    /// The rule is that:
    ///
    /// * An even number of backslashes followed by a quotation mark ('"') means that
    ///   - the backslashes are escaped, so half that many get injected into the buffer, and
    ///   - the quotation mark is a lead-in/lead-out quotation mark. It is left unconsumed
    ///     for the caller to handle and does not get added to the buffer.
    ///
    /// * An odd number of backslashes followed by a quotation mark ('"') means that
    ///   - the backslashes are escaped, so half that many (rounded down) get injected
    ///     into the buffer, and
    ///   - the quotation mark is escaped. It's a literal quotation mark that also gets
    ///     injected into the buffer.
    ///
    /// * Any number of backslashes that aren't followed by a quotation mark ('"') have no
    ///   special meaning: all of them get added to the buffer as-is.
    /// </remarks>
    private void ParseEscapeSequence()
    {
        // count the backslashes first; how many to emit depends on what follows them
        int start = i;
        while ( i < cmd.Length && cmd[i] == '\\' )
        {
            ++i;
        }
        int count = i - start;

        bool quoteFollows = i < cmd.Length && cmd[i] == '"';
        if ( !quoteFollows )
        {
            // plain backslashes: pass every one of them through unchanged
            buf.Append( '\\' , count );
            return;
        }

        // each pair of backslashes collapses to a single literal backslash
        buf.Append( '\\' , count / 2 );

        if ( ( count % 2 ) != 0 )
        {
            // the left-over backslash escapes the quotation mark: emit it literally
            buf.Append( '"' );
            ++i;
        }
    }

    /// <summary>
    /// Consume inter-argument whitespace
    /// </summary>
    private void ConsumeWhitespace()
    {
        while ( i < cmd.Length && char.IsWhiteSpace( cmd[i] ) )
        {
            ++i;
        }
    }
}
/// <summary>
/// Demo driver: parses this process's own raw command line with
/// <see cref="CommandLineParser"/> and prints each resulting word on its own line.
/// </summary>
class Program
{
    /// <summary>
    /// Prints the command name as "command: ..." and every following word as
    /// "argv[n]: ...", where n starts at 1.
    /// </summary>
    static void Main()
    {
        CommandLineParser parser = new CommandLineParser();
        string commandLine = RetrieveUnparsedCommandLine();
        int i = 0;
        IEnumerable<string> args = parser.Parse( commandLine );

        Console.WriteLine( "-------------------" );
        foreach ( string arg in args )
        {
            // The first word is the command itself; later words are numbered arguments.
            // The placeholder must be wrapped in braces, otherwise string.Format emits
            // the template text verbatim instead of substituting the index.
            string label = i > 0 ? string.Format( "argv[{0:#0}]" , i ) : "command";
            ++i;
            Console.WriteLine( "{0}: {1}" , label , arg );
        }
        Console.WriteLine( "-------------------------" );
    }

    /// <summary>
    /// Returns the raw, unparsed command line of the current process.
    /// </summary>
    /// <returns>The value of <see cref="Environment.CommandLine"/>.</returns>
    static string RetrieveUnparsedCommandLine()
    {
        // get the raw command line. Source might be registry, config file, whatever
        return Environment.CommandLine;
    }
}
Good Luck.