You are on page 1 of 28

PROGRAM NO: - 1

Unit/Topic: I /Lexical Analyzer.

PROBLEM DEFINITION:
Develop a lexical analyzer to recognize a few patterns in C.

OBJECTIVE:
To understand Goal , lexical Structure & Implementation of lexical Analyzer.

ALGORITHM: Lexical Structure:


Lex generates a C-language scanner from a source specification that you write. This specification contains a list of rules indicating sequences of characters -- expressions -- to be searched for in an input text, and the actions to take when an expression is found.

The C source code for the lexical analyzer is generated when you enter $ lex lex.l where lex.l is the file containing your lex specification. (The name lex.l is conventionally the favorite, but you can use whatever name you want. Keep in mind, though, that the .l suffix is a convention recognized by other system tools, make in particular.) The source code is written to an output file called lex.yy.c by default. That file contains the definition of a function called yylex() that returns 1 whenever an expression you have specified is found in the input text, 0 when end of file is encountered. Each call to yylex() parses one token (assuming a return); when yylex() is called again, it picks up where it left off. The resulting executable file reads stdin and writes its output to stdout

Lex Working

Program:
/****************************************************************
 Necessary header files used in the program.
*****************************************************************/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>   /* added: malloc() and exit() are used below   */
#include <ctype.h>
#include <conio.h>    /* non-standard (Turbo C / DOS): getch()       */

/****************************************************************
 Function prototypes.
*****************************************************************/
void Open_File();
void Demage_Lexeme();
int  Search(char[256], int);
void analyze();
void Skip_Comment();
void Read_String();
void Is_Keyword_Or_Not();
void Is_Identifier_Or_Not();
void Is_Operator_Or_Not();
void Read_Number();
void Is_Special_Or_Not();
void Is_Comparison_Or_Not();
void Add_To_Lexical(char[256], int, char[256]);
void Print_ST();
void Print_TOKEN();
void Token_Attribute();

/**************************************************************** Data structure used in program. *****************************************************************/ struct lexical { char data[256]; //Value of token. int line[256]; //Line # which token appear in input file. int times; //# of times that token appear in input file. char type[256]; //Type of each token. struct lexical *next; };

typedef struct lexical Lex; typedef Lex *lex; /**************************************************************** File pointer for accessing the file. *****************************************************************/ FILE *fp; FILE *st; FILE *token; char lexeme[256],ch; int f,flag,line=1,i=1; lex head=NULL,tail=NULL; /**************************************************************** Array holding all keywords for checking. *****************************************************************/ char *keywords[]={"procedure","is","begin","end","var","cin","cout","if", "then","else","and","or","not","loop","exit","when", "while","until"}; /**************************************************************** Array holding all arithmetic operations for checking. *****************************************************************/ char arithmetic_operator[]={'+','-','*','/'}; /**************************************************************** Array holding all comparison operations for checking. *****************************************************************/ char *comparison_operator[]={"<",">","=","<=","<>",">="}; /**************************************************************** Array holding all special for checking. *****************************************************************/ char special[]={'%','!','@','~','$'}; /**************************************************************** ************** *MAIN PROGRAM* ************** *****************************************************************/ void main() { Open_File(); analyze(); fclose(fp); Print_ST(); Print_TOKEN(); }

/**************************************************************** This function open input sourse file. *****************************************************************/ void Open_File() { fp=fopen("source.txt","r"); //provide path for source.txt here if(fp==NULL) { printf("!!!Can't open input file - source.txt!!!"); getch(); exit(0); } } /**************************************************************** Function to add item to structure of array to store data and information of lexical items. *****************************************************************/ void Add_To_Lexical (char value[256],int line,char type[256]) { lex new_lex; if (!Search(value,line)) { //When return 1 the token not found.

new_lex=malloc(sizeof(Lex)); if (new_lex!=NULL) { strcpy(new_lex->data,value); new_lex->line[0]=line; new_lex->times=1; strcpy(new_lex->type,type); new_lex->next=NULL; if (head==NULL) head=new_lex; else tail->next=new_lex; tail=new_lex; } }

} /**************************************************************** Function to search token. *****************************************************************/ int Search (char value[256],int line) { lex x=head; int flag=0; while (x->next!=NULL && !flag) { if (strcmp(x->data,value)==0) { x->line[x->times]=line; x->times++; flag=1; } x=x->next; } return flag; } /**************************************************************** Function to print the ST.TXT . *****************************************************************/ void Print_ST() { lex x=head; int j; if ((st=fopen("ST.TXT","w"))==NULL) printf("The file ST.TXT cat not open. "); else { fprintf(st," %s %s %s ","Line#","Lexeme","Type"); fprintf(st," --------"); while (x!=NULL) ----

{ if ((strcmp(x->type,"num")==0) || (strcmp(x->type,"keyword")==0) || (strcmp(x->type,"identifier")==0)) { fprintf(st," "); for (j=0;j<x->times;j++) { fprintf(st,"%d",x->line[j]); if (j!=x->times-1) //This condition to prevent the comma fprintf(st,",",x->line[j]); //"," to not print after last line #. } fprintf(st," ",x->data,x->type); } x=x->next; } fclose(st); } } /**************************************************************** Function to print the TOKENS.TXT . *****************************************************************/ void Print_TOKEN() { int flag=0; fp=fopen("source.txt","r"); if(fp==NULL) { printf("!!!Can't open input file - source.txt!!!"); getch(); exit(0); } else { if ((token=fopen("TOKENS.TXT","w"))==NULL) printf("The file ST.TXT cat not open. %-6s %-6s

"); else { ch=fgetc(fp); while (!(feof(fp))) { if (ch==' ' && !flag) { do ch=fgetc(fp); while (ch==' '); fseek(fp,-2,1); ch=fgetc(fp); flag=1; } if (ch!=' ' && ch!=' ') fprintf(token,"%c",ch); if (ch==' ') { fprintf(token," "); Token_Attribute(); i++; flag=0; } ch=fgetc(fp); } } } fclose(fp); fclose(token); } /**************************************************************** Function to put the token and atrribute in TOKENS.TXT . *****************************************************************/

void Token_Attribute() { lex x=head; int j; while (x!=NULL) { if (x->line[0]==i) { fprintf(token,"token : %-4s

",x->type);

if ((strcmp(x->type,"num")==0) || (strcmp(x->type,"keyword")==0) || (strcmp(x->type,"identifier")==0)) { fprintf(token,"attribute : line#=%-4d ",i); } else { fprintf(token,"attribute : %-4s ",x->data); } } x=x->next; } fprintf(token," "); } /**************************************************************** Function to create lexical analysis. *****************************************************************/ void analyze() { ch=fgetc(fp); while(!feof(fp)) { //Read character. //While the file is not end.

if(ch==' ') . { line++; ch=fgetc(fp); } if(isspace(ch) && ch==' ') { line++; ch=fgetc(fp); } if(isspace(ch) && ch!=' ') //The character is space. ch=fgetc(fp); if(ch=='/' || ch=='"') file Skip_Comment(); //and '"' with display statements. if(isalpha(ch)) //The character is leter. { Read_String(); Is_Keyword_Or_Not(); Is_Operator_Or_Not(); Is_Identifier_Or_Not(); } if(isdigit(ch)) //The character is digit. Read_Number(); if (ch==';') //The character is semicolon. Add_To_Lexical(";",line,"semicolon"); if (ch==':') //The character is colon. Add_To_Lexical(":",line,"colon"); if (ch==',') //The character is comma. //Function for skipping comments in the //Compute # of lines in source.txt

Add_To_Lexical(",",line,"comma"); if (ch=='(') //The character is parenthesis. Add_To_Lexical("(",line,"parenthesis"); if (ch==')') //The character is parenthesis. Add_To_Lexical(")",line,"parenthesis"); //The character is comparison_operator if (ch=='<' || ch=='=' || ch=='>') Is_Comparison_Or_Not(); Is_Special_Or_Not(); Demage_Lexeme(); if(isspace(ch) && ch==' ') { line++; ch=fgetc(fp); } else ch=fgetc(fp); } } /**************************************************************** This function read all character of strings. *****************************************************************/ void Read_String() { int j=0; do { lexeme[j++]=ch; ch=fgetc(fp); } while(isalpha(ch)); fseek(fp,-1,1); lexeme[j]=' //After failed scaning in before cases //check the character is special or not.

INPUT SET:
The input for the lexical analyzer is a textfile SOURCE.TXT consisting of several lines of text (a program) being a correctly formed sequence of lexemes corresponding to the above definitions, whitespaces and comments

OUTPUT SET:
The output of your lexical analyzer consists of 2 text files ST.TXT & TOKENS.TXT. 1. ST.TXT is the symbol table created by the lexical analyzer. Each line consists of three parts: - line number - the lexeme (string) - type (string) , being one of the following: keyword, identifier, num 2. TOKENS.TXT is the list of tokens produced by the lexical analyzer with the following structure: - one line of input (in the order of appearance in SOURCE.TXT) - corresponding pairs token, attribute, each in a separate line in the order as they occur in the line - blank line The attribute of a keyword, identifier or a number is the line number in the symbol table. The attribute of any other token is the lexeme itself. The longest prefix of the input that can match any regular expression pi is taken as the next token.

NOTES:

NAME OF FACULTY: SIGNATURE: DATE:

PROGRAM NO: - 2
Unit/Topic: Linux

PROBLEM DEFINITION:
Write a program to parse using the brute-force technique of top-down parsing.

OBJECTIVE:
To understand Brute force technique.

ALGORITHM:
Top-down parsing: The top-down construction of a parse tree is done by starting with the root, labeled with the starting non-terminal, and repeatedly performing the following two steps: 1. At node n, labeled with non-terminal A, select one of the productions for A and construct children at n for the symbols on the right side of the production. 2. Find the next node at which a subtree is constructed. For some grammars, the above steps can be implemented during a single left-to-right scan of the input string. The current token being scanned on the input is often called the lookahead symbol. Initially the lookahead symbol is the first, i.e. the leftmost, token of the input string. Let us consider the following grammar: A -> BA | a | aa, B -> BB | b, and now consider the input string bbaa. The top-down parsing would look like this in different steps: 1. A

input : bbaa , the highlighted character indicates the lookahead. 2. B 3. B B 4. B B b 5. input: bbaa A B B b 6. input: bbaa B B b B b a B b A A a A B B A A A A A A

Here we have assumed that , at the first attempt the parser would know which production to use to get the right output, but in general, the selection of a production of a non-terminal may involve trial and error, that is we may have to try a production and backtrack to try another production if the first is found to be unsuitable. A production is unsuitable ,if after using the production, we cannot complete the tree to match the input string .we will discuss this parsing in the next section.

Brute Force algorithm

Main features

no preprocessing phase; constant extra space needed; always shifts the window by exactly 1 position to the right; comparisons can be done in any order; searching phase in O(mn) time complexity; 2n expected text characters comparisons.

Description
The brute force algorithm consists in checking, at all positions in the text between 0 and n-m, whether an occurrence of the pattern starts there or not. Then, after each attempt, it shifts the pattern by exactly one position to the right. The brute force algorithm requires no preprocessing phase, and a constant extra space in addition to the pattern and the text. During the searching phase the text character comparisons can be done in any order. The time complexity of this searching phase is O(mn) (when searching for a^(m-1)b in a^n, for instance). The expected number of text character comparisons is 2n.

The C code
void BF(char *x, int m, char *y, int n) { int i, j; /* Searching */ for (j = 0; j <= n - m; ++j) { for (i = 0; i < m && x[i] == y[i + j]; ++i); if (i >= m) OUTPUT(j); } }

This algorithm can be rewriting to give a more efficient algorithm in practice as follows:
#define EOS '\0' void BF(char *x, int m, char *y, int n) { char *yb; /* Searching */ for (yb = y; *y != EOS; ++y) if (memcmp(x, y, m) == 0) OUTPUT(y - yb); }

INPUT SET: OUTPUT SET: NOTES:

NAME OF FACULTY: SIGNATURE: DATE:

PROGRAM NO: - 3
Unit/Topic: Linux

PROBLEM DEFINITION:
Develop an LL(1) parser (construct the parse table also).

OBJECTIVE:
To understand LL(1) parser and construction of parse table.

ALGORITHM:
An LL parser is a top-down parser for a subset of the context-free grammars. It parses the input from Left to right, and constructs a Leftmost derivation of the sentence (hence LL, compared with LR parser). The class of grammars which are parsable in this way is known as the LL grammars.The parser works on strings from a particular context-free grammar. The parser consists of

an input buffer, holding the input string (built from the grammar) a stack on which to store the terminals and non-terminals from the grammar yet to be parsed a parsing table which tells it what (if any) grammar rule to apply given the symbols on top of its stack and the next input token

The parser applies the rule found in the table by matching the top-most symbol on the stack (row) with the current symbol in the input stream (column).When the parser starts, the stack already contains two symbols: [ S, $ ] Where '$' is a special terminal to indicate the bottom of the stack and the end of the input stream, and 'S' is the start symbol of the grammar. The parser will attempt to rewrite the contents of this stack to what it sees on the input stream. However, it only keeps on the stack what still needs to be rewritten. Parser implementation
#include <iostream> #include <map> #include <stack> enum Symbols { // the symbols: // Terminal symbols: TS_L_PARENS, // ( TS_R_PARENS, // )

TS_A, TS_PLUS, TS_EOS, TS_INVALID,

// // // //

a + $, in this case corresponds to '\0' invalid token

// Non-terminal symbols: NTS_S, // S NTS_F }; /* Converts a valid token to the corresponding terminal symbol */ enum Symbols lexer(char c) { switch(c) { case '(': return TS_L_PARENS; break; case ')': return TS_R_PARENS; break; case 'a': return TS_A; break; case '+': return TS_PLUS; break; case '\0': // this will act as the $ terminal symbol return TS_EOS; break; default: return TS_INVALID; break;

} }

int main(int argc, char **argv) { using namespace std; if (argc < 2) { cout << "usage:\n\tll '(a+a)'" << endl; return 0; } map< enum Symbols, map<enum Symbols, int> > table; // LL parser table, maps < non-terminal, terminal> pair to action

stack<enum Symbols> ss; // symbol stack char *p; // input buffer // initialize the symbols stack ss.push(TS_EOS); // terminal, $ ss.push(NTS_S); // non-terminal, S // initialize the symbol stream cursor p = &argv[1][0]; // setup the parsing table table[NTS_S][TS_L_PARENS] = 2; table[NTS_S][TS_A] = 1; table[NTS_F][TS_A] = 3; while(ss.size() > 0) { if(lexer(*p) == ss.top()) { cout << "Matched symbols: " << lexer(*p) << endl; p++; ss.pop(); } else { cout << "Rule " << table[ss.top()][lexer(*p)] << endl; switch(table[ss.top()][lexer(*p)]) { case 1: // 1. S F ss.pop(); ss.push(NTS_F); // F break; case 2: // 2. S ( S + F ) ss.pop(); ss.push(TS_R_PARENS); ss.push(NTS_F); ss.push(TS_PLUS); ss.push(NTS_S); ss.push(TS_L_PARENS); break; case 3: // 3. F a ss.pop(); ss.push(TS_A); // a break; default: cout << "parsing table defaulted" << return 0; break; } } cout << "finished parsing" << endl; return 0; }

// // // // //

) F + S (

endl;

Constructing an LL(1) parsing table In order to fill the parsing table, we have to establish what grammar rule the parser should choose if it sees a nonterminal A on the top of its stack and a symbol a on its input stream. It is easy to see that such a rule should be of the form A -> w and that the language corresponding to w should have at least one string starting with a. For this purpose we define the First-set of w, written here as Fi(w), as the set of terminals that can be found at the start of any string in w, plus ε if the empty string also belongs to w. Given a grammar with the rules A1 -> w1, ..., An -> wn, we can compute the Fi(wi) and Fi(Ai) for every rule as follows: 1. initialize every Fi(wi) and Fi(Ai) with the empty set 2. add Fi(wi) to Fi(Ai) for every rule Ai -> wi, where Fi is defined as follows: o Fi(a w') = { a } for every terminal a o Fi(A w') = Fi(A) for every nonterminal A with ε not in Fi(A) o Fi(A w') = (Fi(A) \ { ε }) ∪ Fi(w') for every nonterminal A with ε in Fi(A) o Fi(ε) = { ε } 3. add Fi(wi) to Fi(Ai) for every rule Ai -> wi 4. do steps 2 and 3 until all Fi sets stay the same.

INPUT SET: OUTPUT SET:

NOTES: As can be seen from the example the parser performs three types of steps depending on
whether the top of the stack is a nonterminal, a terminal or the special symbol $: If the top is a nonterminal then it looks up in the parsing table on the basis of this nonterminal and the symbol on the input stream which rule of the grammar it should use to replace it with on the stack. The number of the rule is written to the output stream. If the parsing table indicates that there is no such rule then it reports an error and stops. If the top is a terminal then it compares it to the symbol on the input stream and if they are equal they are both removed. If they are not equal the parser reports an error and stops. If the top is $ and on the input stream there is also a $ then the parser reports that it has successfully parsed the input, otherwise it reports an error. In both cases the parser will stop.

These steps are repeated until the parser stops, and then it will have either completely parsed the input and written a leftmost derivation to the output stream or it will have reported an error.

NAME OF FACULTY: SIGNATURE: DATE:

PROGRAM NO: - 4
Unit/Topic:

PROBLEM DEFINITION:
Develop an operator precedence parser.

OBJECTIVE:
To understand operator precedence parser.

ALGORITHM:
An operator precedence parser is a bottom-up parser that interprets an operator-precedence grammar. For example, most calculators use operator precedence parsers to convert from the human-readable infix notation with order of operations format into an internally optimized computer-readable format like Reverse Polish Notation (RPN).

Example execution of the algorithm


An example execution on the expression 2 + 3 * 4 + 5 == 19 is as follows. We give precedence 0 to equality expressions, 1 to additive expressions, 2 to multiplicative expressions. parse_expression_1 (lhs = 2, min_precedence = 0)

the next token is +, with precedence 1. the while loop is entered. op is + (precedence 1) rhs is 3 the next token is *, with precedence 2. recursive invocation. parse_expression_1 (lhs = 3, min_precedence = 2)

the next token is *, with precedence 2. the while loop is entered. op is * (precedence 2) rhs is 4 the next token is +, with precedence 1. no recursive invocation. lhs is assigned 3*4 = 12 the next token is +, with precedence 1. the while loop is left. 12 is returned.

the next token is +, with precedence 1. no recursive invocation. lhs is assigned 2+12 = 14 the next token is +, with precedence 1. the while loop is not left. op is + (precedence 1)

rhs is 5 the next token is ==, with precedence 0. no recursive invocation. lhs is assigned 14+5 = 19 the next token is ==, with precedence 0. the while loop is not left. op is == (precedence 0) rhs is 19 the next token is end-of-line, which is not an operator. no recursive invocation. lhs is assigned the result of evaluating 19 == 19, for example 1 (as in the C standard). the next token is end-of-line, which is not an operator. the while loop is left.

1 is returned. #include <stdio.h> int main(int argc, char *argv[]){ int i; printf("(((("); for(i=1;i!=argc;i++){ if(argv[i] && !argv[i][1]){ switch(*argv[i]){ case '(': printf("(((("); continue; case ')': printf("))))"); continue; case '^': printf(")^("); continue; case '*': printf("))*(("); continue; case '/': printf("))/(("); continue; case '+': if (i == 1 || strchr("(^*/+-", *argv[i-1])) printf("+"); else printf(")))+((("); continue; case '-': if (i == 1 || strchr("(^*/+-", *argv[i-1])) printf("-"); else printf(")))-((("); continue; } } printf("%s", argv[i]); } printf("))))\n"); return 0; }

INPUT SET: OUTPUT SET: NOTES:

NAME OF FACULTY: SIGNATURE: DATE:

PROGRAM NO: - 5
Unit/Topic:

PROBLEM DEFINITION:
Develop a recursive descent parser.

OBJECTIVE:
To understand recursive descent parser. .

ALGORITHM:
Recursive descent parser is a top-down parser built from a set of mutually-recursive procedures (or a non-recursive equivalent) where each such procedure usually implements one of the production rules of the grammar. Thus the structure of the resulting program closely mirrors that of the grammar it recognizes. A predictive parser is a recursive descent parser that does not require backtracking. Predictive parsing is possible only for the class of LL(k) grammars, which are the context-free grammars for which there exists some positive integer k that allows a recursive descent parser to decide which production to use by examining only the next k tokens of input. (The LL(k) grammars therefore exclude all ambiguous grammars, as well as all grammars that contain left recursion. Any context-free grammar can be transformed into an equivalent grammar that has no left recursion, but removal of left recursion does not always yield an LL(k) grammar.) A predictive parser runs in linear time. Recursive descent with backup is a technique that determines which production to use by trying each production in turn. Recursive descent with backup is not limited to LL(k) grammars, but is not guaranteed to terminate unless the grammar is LL(k). Even when they terminate, parsers that use recursive descent with backup may require exponential time. Although predictive parsers are widely used, programmers often prefer to create LR or LALR parsers via parser generators without transforming the grammar into LL(k) form. C implementation What follows is an implementation of a recursive descent parser for the above language in C. The parser reads in source code, and exits with an error message if the code fails to parse, exiting silently if the code parses correctly.

Notice how closely the predictive parser below mirrors the grammar above. There is a procedure for each nonterminal in the grammar. Parsing descends in a top-down manner, until the final nonterminal has been processed. The program fragment depends on a global variable, sym, which contains the next symbol from the input, and the function getsym, which updates sym when called. The implementations of the functions getsym and error are omitted for simplicity. typedef enum {ident, number, lparen, rparen, times, slash, plus, minus, eql, neq, lss, leq, gtr, geq, callsym, beginsym, semicolon, endsym, ifsym, whilesym, becomes, thensym, dosym, constsym, comma, varsym, procsym, period, oddsym} Symbol; Symbol sym; void getsym(void); void error(const char msg[]); void expression(void); int accept(Symbol s) { if (sym == s) { getsym(); return 1; } return 0; } int expect(Symbol s) { if (accept(s)) return 1; error("expect: unexpected symbol"); return 0; } void factor(void) { if (accept(ident)) { ; } else if (accept(number)) { ; } else if (accept(lparen)) { expression(); expect(rparen); } else { error("factor: syntax error"); getsym(); } }

void term(void) { factor(); while (sym == times || sym == slash) { getsym(); factor(); } } void expression(void) { if (sym == plus || sym == minus) getsym(); term(); while (sym == plus || sym == minus) { getsym(); term(); } } void condition(void) { if (accept(oddsym)) { expression(); } else { expression(); if (sym == eql || sym == neq || sym == lss || sym == leq || sym == gtr || sym == geq) { getsym(); expression(); } else { error("condition: invalid operator"); getsym(); } } } void statement(void) { if (accept(ident)) { expect(becomes); expression(); } else if (accept(callsym)) { expect(ident); } else if (accept(beginsym)) { do { statement(); } while (accept(semicolon)); expect(endsym);

} else if (accept(ifsym)) { condition(); expect(thensym); statement(); } else if (accept(whilesym)) { condition(); expect(dosym); statement(); } } void block(void) { if (accept(constsym)) { do { expect(ident); expect(eql); expect(number); } while (accept(comma)); expect(semicolon); } if (accept(varsym)) { do { expect(ident); } while (accept(comma)); expect(semicolon); } while (accept(procsym)) { expect(ident); expect(semicolon); block(); expect(semicolon); } statement(); } void program(void) { getsym(); block(); expect(period); }

INPUT SET:

OUTPUT SET: NOTES:

NAME OF FACULTY: SIGNATURE: DATE:

You might also like