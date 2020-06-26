GNU/Linux system administrator and programmer
For example, the command `ls -l` consists of two words: `ls` (the command's name), and `-l` (the first and sole argument). Similarly, the command `gcc -o shell main.c prompt.c` (which we used in part I to compile our shell) consists of 5 words: a command name, and a list of 4 arguments.
`;` and `&`.
Remember the `read_cmd()` function, which we defined in part I of this tutorial? That was the function we used to read the user's input and return it as a `malloc`'d string. We could pass this string directly to our scanner, but that would make the scanning process a bit cumbersome. In particular, it would be very difficult for the scanner to remember the last character it gave us, so that it can move past that character and give us the character that follows.
a `struct source_s` structure, which we'll define in the source file `source.h`. Go ahead and create that file in your source directory, then open it in your favorite text editor and add the following code:
/* source.h -- character-level access to the input text handed to the scanner */
#ifndef SOURCE_H
#define SOURCE_H
/*
 * Returned by next_char() and peek_char() when the read position reaches
 * the end of the buffer.
 * NOTE(review): <stdio.h> defines EOF as well; files that include both
 * headers rely on the two definitions being identical -- confirm.
 */
#define EOF (-1)
/* returned by next_char() and peek_char() when the source or its buffer is NULL */
#define ERRCHAR ( 0)
/*
 * Value of curpos that next_char() and peek_char() treat as "nothing read yet".
 * Presumably stored in curpos by whoever sets up a fresh source -- verify
 * against the caller.
 */
#define INIT_SRC_POS (-2)
/* an input string together with the current read position in it */
struct source_s
{
char *buffer; /* the input text */
long bufsize; /* size of the input text */
long curpos; /* absolute char position in source */
};
/*
 * NOTE(review): next_char() and peek_char() return plain char while EOF is -1.
 * Where plain char is unsigned, a returned EOF will not compare equal to EOF
 * -- confirm the target platforms.
 */
/* advance the read position and return the character found there */
char next_char(struct source_s *src);
/* step the read position back by one character */
void unget_char(struct source_s *src);
/* return the character next_char() would return, without advancing */
char peek_char(struct source_s *src);
/* advance past any run of spaces and tabs */
void skip_white_spaces(struct source_s *src);
#endif
`struct source_s` contains a pointer to the input string, in addition to two `long` fields that hold information about the string's length and our current position in the string (where we'll get the next character from).
Now create another file named `source.c`, to which you should add the following code:
#include <errno.h>
#include "shell.h"
#include "source.h"
/*
 * Step the read position back by one character, so that the next call to
 * next_char() returns the most recently read character again.
 *
 * Only the position is changed; the buffer is never modified. The call is
 * a no-op when src is NULL or when the position is already negative.
 */
void unget_char(struct source_s *src)
{
    /* guard against a NULL source, as next_char() and peek_char() do */
    if(!src || src->curpos < 0)
    {
        return;
    }
    src->curpos--;
}
/*
 * Advance the read position by one and return the character found there.
 *
 * Returns:
 *   ERRCHAR  if src or src->buffer is NULL (errno is set to ENODATA);
 *   EOF      once the position reaches src->bufsize;
 *   otherwise the character at the new position.
 *
 * The function never reads the buffer at the old position, so it is safe
 * to call after unget_char() has moved the position to -1 and after a
 * previous call has already returned EOF.
 */
char next_char(struct source_s *src)
{
    if(!src || !src->buffer)
    {
        errno = ENODATA;
        return ERRCHAR;
    }

    /* a fresh source starts just before the first character */
    if(src->curpos == INIT_SRC_POS)
    {
        src->curpos = -1;
    }

    if(++src->curpos >= src->bufsize)
    {
        /* pin the position at the end so repeated calls keep returning EOF */
        src->curpos = src->bufsize;
        return EOF;
    }
    return src->buffer[src->curpos];
}
char peek_char(struct source_s *src)
{
if(!src || !src->buffer)
{
errno = ENODATA;
return ERRCHAR;
}
long pos = src->curpos;
if(pos == INIT_SRC_POS)
{
pos++;
}
pos++;
if(pos >= src->bufsize)
{
return EOF;
}
return src->buffer[pos];
}
void skip_white_spaces(struct source_s *src)
{
char c;
if(!src || !src->buffer)
{
return;
}
while(((c = peek_char(src)) != EOF) && (c == ' ' || c == '\t'))
{
next_char(src);
}
}
The `unget_char()` function returns (or ungets) the last character we've retrieved from input, back to the input source. It does this by simply manipulating the source structure's pointers. You'll see the benefit of this function later in this series.
The `next_char()` function returns the next character of input and updates the source pointer, so that the next call to `next_char()` returns the following input character. When we reach the last character in input, the function returns the special character `EOF`, which we defined as -1 in `source.h` above.
The `peek_char()` function is similar to `next_char()` in that it returns the next character of input. The only difference is that `peek_char()` doesn't update the source pointer, so that the next call to `next_char()` returns the same input character we've just peeked at. You'll see the benefit of input peeking later in this series.
The `skip_white_spaces()` function skips all whitespace characters. This will help us when we've finished reading a token, and we want to skip delimiter whitespaces before we read the next token.
Create a file named `scanner.h` in your source directory, then open it and add the following code:
/* scanner.h -- token structure and entry points of the lexical scanner */
#ifndef SCANNER_H
#define SCANNER_H
/* a single token, as returned by tokenize() */
struct token_s
{
struct source_s *src; /* source of input */
int text_len; /* length of token text */
char *text; /* token text */
};
/* the special EOF token, which indicates the end of input */
extern struct token_s eof_token;
/*
 * Read the next token from src.
 * Returns a newly allocated token, or the address of eof_token when there
 * is no more input or an error occurred; compare the result against
 * &eof_token before using or freeing it.
 */
struct token_s *tokenize(struct source_s *src);
/* release a token returned by tokenize(), including its text */
void free_token(struct token_s *tok);
#endif
`struct token_s` contains a pointer to the `struct source_s` that holds our input. The structure also contains a pointer to the token's text, and a field that tells us the length of that text (so that we don't need to repeatedly call `strlen()` on the token's text).
Next, we'll write the `tokenize()` function, which will retrieve the next token from input. We'll also write some helper functions that will help us work with input tokens.
Create a file named `scanner.c`, and enter the following code:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include "shell.h"
#include "scanner.h"
#include "source.h"
/* buffer in which tokenize() accumulates the text of the current token */
char *tok_buf = NULL;
/* number of bytes currently allocated for tok_buf */
int tok_bufsize = 0;
/* index in tok_buf at which add_to_buf() stores the next character */
int tok_bufindex = -1;
/* special token to indicate end of input */
/*
 * Statically allocated: tokenize() returns the address of this object.
 * Members not named in the initializer (src, text) are zero-initialized.
 */
struct token_s eof_token =
{
.text_len = 0,
};
/*
 * Double the capacity of the token buffer.
 *
 * Returns 1 on success. On failure returns 0 with errno set to ENOMEM and
 * leaves tok_buf and tok_bufsize untouched.
 */
static int grow_tok_buf(void)
{
    char *tmp = realloc(tok_buf, (size_t)tok_bufsize * 2);
    if(!tmp)
    {
        errno = ENOMEM;
        return 0;
    }
    tok_buf = tmp;
    tok_bufsize *= 2;
    return 1;
}

/*
 * Append one character to the token buffer, growing the buffer as needed.
 *
 * If the buffer has not been set up yet, errno is set to EINVAL and the
 * character is dropped. If the buffer is full and cannot be grown, errno
 * is set to ENOMEM and the character is dropped; the function never
 * writes past the end of the buffer.
 */
void add_to_buf(char c)
{
    /* nothing to append to until the buffer has been allocated and reset */
    if(!tok_buf || tok_bufsize <= 0 || tok_bufindex < 0)
    {
        errno = EINVAL;
        return;
    }

    /*
     * The buffer can only be full here if an earlier growth attempt
     * failed. Retry, and give up on this character if it fails again.
     */
    if(tok_bufindex >= tok_bufsize && !grow_tok_buf())
    {
        return;
    }

    tok_buf[tok_bufindex++] = c;

    /* grow eagerly so that tokenize() normally has room for the final NUL */
    if(tok_bufindex >= tok_bufsize)
    {
        grow_tok_buf();
    }
}
/*
 * Build a heap-allocated token holding a private copy of str.
 *
 * The token's text and text_len are filled in; every other member is
 * zero. Returns NULL if either allocation fails. Release the result with
 * free_token().
 */
struct token_s *create_token(char *str)
{
    /* zero-filled allocation: members not assigned below start out as 0 */
    struct token_s *token = calloc(1, sizeof(struct token_s));
    if(token == NULL)
    {
        return NULL;
    }

    token->text_len = strlen(str);

    /* +1 for the terminating NUL */
    char *copy = malloc(token->text_len+1);
    if(copy == NULL)
    {
        free(token);
        return NULL;
    }

    strcpy(copy, str);
    token->text = copy;
    return token;
}
void free_token(struct token_s *tok)
{
if(tok->text)
{
free(tok->text);
}
free(tok);
}
/*
 * Read the next token from src.
 *
 * Spaces and tabs separate tokens and are never part of one. A newline
 * ends the token being read and is left in the input; when no token is
 * in progress the newline is returned as a token of its own.
 *
 * Returns a token created by create_token() with its src member set, or
 * the address of eof_token when src is unusable (errno = ENODATA), the
 * token buffer cannot be allocated (errno = ENOMEM), the input holds no
 * further token, or the token structure cannot be allocated.
 */
struct token_s *tokenize(struct source_s *src)
{
    if(src == NULL || src->buffer == NULL || src->bufsize == 0)
    {
        errno = ENODATA;
        return &eof_token;
    }

    /* the token buffer is allocated on first use and kept across calls */
    if(tok_buf == NULL)
    {
        tok_bufsize = 1024;
        tok_buf = malloc(tok_bufsize);
        if(tok_buf == NULL)
        {
            errno = ENOMEM;
            return &eof_token;
        }
    }

    /* start with an empty token */
    tok_bufindex = 0;
    tok_buf[0] = '\0';

    char ch = next_char(src);
    if(ch == EOF || ch == ERRCHAR)
    {
        return &eof_token;
    }

    for(;;)
    {
        if(ch == ' ' || ch == '\t')
        {
            /* blanks in front of a token are skipped; a blank after one ends it */
            if(tok_bufindex > 0)
            {
                break;
            }
        }
        else if(ch == '\n')
        {
            if(tok_bufindex > 0)
            {
                /* leave the newline in the input for the next call */
                unget_char(src);
            }
            else
            {
                add_to_buf(ch);
            }
            break;
        }
        else
        {
            add_to_buf(ch);
        }

        ch = next_char(src);
        if(ch == EOF)
        {
            break;
        }
    }

    /* nothing but blanks was read */
    if(tok_bufindex == 0)
    {
        return &eof_token;
    }

    /* keep the terminator inside the buffer if the buffer is completely full */
    if(tok_bufindex >= tok_bufsize)
    {
        tok_bufindex--;
    }
    tok_buf[tok_bufindex] = '\0';

    struct token_s *result = create_token(tok_buf);
    if(result == NULL)
    {
        fprintf(stderr, "error: failed to alloc buffer: %s\n", strerror(errno));
        return &eof_token;
    }

    result->src = src;
    return result;
}
`tok_buf` is a pointer to the buffer in which we'll store the current token.
`tok_bufsize` is the number of bytes we allocate to the buffer.
`tok_bufindex` is the current buffer index (i.e. it tells us where to add the next input character in the buffer).
`eof_token` is a special token we'll use to signal the end of file/input (EOF).
The `add_to_buf()` function adds a single character to the token buffer. If the buffer is full, the function takes care of extending it.
The `create_token()` function takes a string and converts it to a `struct token_s` structure. It takes care of allocating memory for the token's structure and text, and fills in the structure's member fields.
The `free_token()` function frees the memory used by a token structure, as well as the memory used to store the token's text.
The `tokenize()` function is the heart and soul of our lexical scanner, and the function's code is fairly straightforward. It starts by allocating memory for our token buffer (if not already done), then initializes our token buffer and source pointers. It then calls `next_char()` to retrieve the next input character. When we reach the end of input, `tokenize()` returns the special `eof_token`, which marks the end of input.
Otherwise, `tokenize()` calls `create_token()`, passing it the token text (which we stored in the buffer). The token text is converted to a token structure, which `tokenize()` then returns to the caller.