// website/linux/awkessentialtraining/tools.js

import { addBanner, addArticle, addTitle, addHeader, addParagraph, addSubHeader } from '/scripts/article.js';
import { addInset, addInsetList, addInsetCodeListing, addInsetBulletList } from '/scripts/inset.js';
import { addImageWithCaption, addButtonGroup } from '/scripts/visuals.js';
import { addSidebar} from '/scripts/sidebar.js'; 
import { addSyntax } from '/scripts/code.js';
import { menu } from '/scripts/web_dev_buttons.js';
import { global_menu } from '/scripts/grid_layout1.js';
import { local_menu } from '/scripts/linux.js';

// Look up the page regions this script works with, in a fixed selector order.
const [heading, global, local, sidebar, main] = [
    ".heading",
    ".global_menu",
    ".local_menu",
    ".sidebar",
    ".main_content",
].map((selector) => document.querySelector(selector));

// Banner: course title, author/source line and chapter name.
heading.append(
    addTitle("AWK Essential Training"),
    addParagraph("David D. Levine - LinkedIn Learning - November 2022"),
    addParagraph("Chapter 8 - Combining AWK with Other Tools"),
);

// Section: combining AWK with other tools using pipes.
// Backslashes that must be visible in the rendered page are doubled, because a single
// backslash inside a JavaScript string literal is consumed as an escape sequence.
main.append(addHeader("COMBINING AWK WITH OTHER TOOLS USING PIPES"));
main.append(addParagraph("In the section on Functions and Arrays, we saw an example where the output of an awk command is piped to the sort command so that we can sort the output of the wordusage.awk program.  A pipe is also commonly used to pipe the output of a command that produces too much output to read on the screen to a program that will paginate the output such as less or more."));
main.append(addParagraph("We can also pipe the output from other commands into awk. As an example, a command like ls -l (or ll which is a common alias for ls -l) produces lines of output in a format that awk can interpret. Consider the following example."));
// "\\." renders as "\." so the displayed pattern really does escape the dot, as the text below explains.
main.append(addSyntax("ls -l | awk '/\\.txt$/{total+=$5; print} END {print total}'"));
main.append(addParagraph("This takes the output of ls -l and pipes it into an AWK command. The command itself isn't too important in this context, but it would be good practice to break this down."));
main.append(addParagraph("Firstly, we have a pattern. This will act as a filter which means that we will ignore any line coming from the list command if it doesn't match the pattern. The pattern is fairly simple, this is just a string literal, .txt. Note that the dot is escaped because we only want to match with .txt. If we omitted the backslash (so the dot isn't being escaped) it would match with any character so rather than only .txt, it would also match with atxt, ?txt, 3txt and so on. Of course, it would also match with .txt, the important point is that the results would almost always be the same either way but escaping it means that any line that matches it will definitely include the string .txt. Not escaping it means that the line definitely includes the string txt and that there is at least one character before it."));
main.append(addParagraph("The $ anchors the string at the end of the line. In this context, the letters txt would normally appear as a file extension so the dot will probably be there and it will likely be at the end of the line. The point here is that you could probably be a little bit sloppy with the pattern and find that it gives you an accurate result most of the time, but will sometimes throw up an inaccurate result so it is important to know exactly what you are trying to match with."));
main.append(addParagraph("The rest of the awk code takes the value of the fifth field, which is the file size and adds it to the total. When all the lines have been parsed, that total is then printed out so this code takes all of the files in a directory (it will also print the line if it matches) and gives us a total size for all the files."));
main.append(addParagraph("The result of running this in a directory containing the Exercise files for this course is shown below."));
main.append(addImageWithCaption("./images/pipe1.jpg", "Piping the output from a directory listing to awk for processing."));
// TODO: restore the link to the SED course. The original anchor was truncated to "<a href='https",
// which produced broken markup, so the unfinished tag has been removed until the URL is known.
main.append(addParagraph("Another tool commonly used with AWK is SED. SED makes it easier for us to change the text we are parsing and actually is complex enough to have a course dedicated to it."));
main.append(addParagraph("It actually does the same thing as the AWK function, sub, but since it is not itself an AWK function, we can use it to filter or process data before passing to AWK to process. We can  also pass the output from an AWK command to SED so it can do some post-processing."));
main.append(addParagraph("If we take a slightly contrived example which is the same as the previous example but we are using ls -F to generate the output and we are looking for any file with an .awk extension."));
main.append(addParagraph("The -F option can add an extra character at the end of each line. For example, it will add an asterisk at the end of the filename if a file is executable."));
main.append(addParagraph("It wouldn't be too difficult to adapt our pattern to ignore the asterisk or any other character that might be there, but since the point is to demonstrate how sed and awk can work together, we will pipe the output to sed and then to awk."));
main.append(addParagraph("The sed command we will use is"));
main.append(addSyntax("s/old/new/"));
main.append(addParagraph("which takes the first occurrence of old in a string and replaces it with new. In this case, we will use an asterisk in place of old and nothing for new which will have the effect of removing the asterisk."));
main.append(addParagraph("The output from the sed command will then be piped to our awk command with a suitably modified pattern that now looks for .awk.  The result is shown below."));
main.append(addImageWithCaption("./images/pipe2.jpg", "Piping a directory output to sed for pre-processing and then to awk for processing."));
main.append(addParagraph("Now, you should bear in mind that pipes can be very versatile and anytime a program produces output that another program can use as input, you can pipe the output from the first program in to the second program."));
main.append(addParagraph("Consider the file, nicknames.txt. It contains a list of names followed by an email address. Let's say that we want to convert this to a list where the email address is in angled brackets. If our data is consistent, by which I mean every line has the same number of fields, it would be easy to do this with a single awk command. The problem is that some of these names have an extra field, a nickname that appears between the first and last name."));
main.append(addParagraph("We can easily remove these with the awk command"));
// The introduced command was missing; it is the first stage of the one-line pipeline shown further down.
main.append(addSyntax("awk '{if(NF==4) {$2=\"\"} ;print}' nicknames.txt"));
main.append(addParagraph("That will send its output to the screen so to get it in a format that we can run a second awk command on, we might write that to a file, possibly overwriting the existing nicknames.txt file. Assuming we do that, we then have consistent data so we can run a command like"));
main.append(addSyntax("awk '{print $1, $2, \"&lt;\" $3 \"&gt;\" }' nicknames.txt"));
main.append(addParagraph("We could write that out to a new file, the same file or we could pipe it to a command like more for a convenient display on the screen."));
main.append(addParagraph("However, there is no reason why we can't connect two awk programs with a pipe so we can do this in one line with"));
main.append(addSyntax("awk '{if(NF==4) {$2=\"\"} ;print}' nicknames.txt | awk '{print $1, $2, \"&lt;\" $3 \"&gt;\" }' | more"));
main.append(addParagraph("You may not think this is the best solution and that is probably true, but it does demonstrate the fact that you can pipe the output from one awk command in to another awk command."));
main.append(addParagraph("It is also another demonstration of the fact that awk will work better with consistent data and highlights the importance of cleaning up your data but it does greatly over simplify the problem by suggesting that if the name has three fields, it is because there is an additional nickname. Of course, some names can take up three or four fields without a nickname or a middle name so there are certainly better ways to do this. For example, you might get the number of fields for each record and then use a loop to print the name fields and put the angled brackets around the last field."));
// Section: parsing Excel CSV files - line endings and quoting.
// "\\r" is written with a doubled backslash so the page shows the two characters \r
// rather than embedding a real carriage return in the text.
main.append(addHeader("PARSING EXCEL CSV FILES: LINE ENDINGS AND QUOTING"));
main.append(addParagraph("In theory, it should be quite easy for AWK to parse a CSV file since the file is made up of lines with each line being made up of a set number of fields separated by commas. However, there can be a couple of problems with that."));
main.append(addParagraph("Let's take an example."));
main.append(addImageWithCaption("./images/excel_xls.jpg", "A sample Excel File."));
main.append(addParagraph("The data looks fairly consistent and each line has three columns so that is quite promising but the format is .xls so AWK will not be able to read it.  We will need to save it as a .csv file which awk can read."));
main.append(addParagraph("To start with, we will print out each line with exclamation marks between the fields and one at the end. We will put this in an AWK file containing one line"));
main.append(addSyntax("{print \"!\" $1 \"! \" $2 \"!\" $ 3 \"!\"}"));
main.append(addParagraph("and we will save the file as excel.awk. We will run it with the command"));
main.append(addSyntax("awk -F, -f excel.awk excel.csv"));
main.append(addParagraph("Just as a reminder, -F, indicates that commas are being used as the field separator and -f indicates that we are getting the awk commands from a file called excel.awk. Of course, excel.csv is the file we will process."));
main.append(addParagraph("When I run this, I get output that actually looks much better than the output David gets, maybe because I saved it on a Windows machine rather than a mac, but it is still not perfect. David's output is"));
main.append(addSyntax("one!!3"));
main.append(addParagraph("The output I get is"));
main.append(addImageWithCaption("./images/excel_csv.jpg", "The output I get when processing the csv file."));
main.append(addParagraph("Clearly, neither result is perfect and the difference is unimportant. What we want to think about is why we are not getting the results we expected. Let's start by taking a look at the file."));
main.append(addImageWithCaption("./images/excel_csv1.jpg", "The excel.csv file viewed in vim."));
main.append(addParagraph("I scrolled up in the image so that you can see the output from processing the file with awk as well as the file itself for comparison."));
main.append(addParagraph("You might recall that the original file had 6 lines and if we pipe the command we used to process it, you will see that we are actually producing 9 lines of output, so we have now seen several strange things. There is the difference in output that David and I get, the difference in the number of lines in the output compared to the original file we used as input, and a question you may have been asking yourself, what are those strange character pairs at the end of some of the lines in the csv file, the ^M, shown in blue."));
main.append(addParagraph("Notice that the number of lines we see in that file seems to be 9 which is the same as the number of lines in the output whereas the number of ^M pairs we see is the same as the actual number of lines in the file, 6, and if you compare their positions in the file to the original file, you will see that they do in fact appear at the end of each of the 6 lines. Actually, this is how an end of line is marked in a Microsoft csv file and in this case, since awk expects a standard linux type line end on my Raspberry Pi and the Apple equivalent on David's mac, this is the source of many of the 'errors' we see."));
main.append(addParagraph("If you save a file in Microsoft, it will often save that file using this type of carriage return character, control and M or ^M, to denote the end of a line whereas Unix like systems tend to use a line feed or control and J. One possible solution might be to use RS to specify Microsoft style line endings as our record separator and to do that we will add a BEGIN clause to our AWK program and use it to set the value of RS to \"\\r\" which will mean that AWK should now recognise the line endings."));
main.append(addParagraph("Actually, I suspect that on my PC, it may have already done that, even though I am running this on a Raspberry Pi, so I suspect the result might be the same. The results are shown below"));
main.append(addImageWithCaption("./images/excel_csv2.jpg", "Running the program with RS set to \\r."));
main.append(addParagraph("This has made some difference. The first line looks correct and we can see that if you use the exclamation marks as a guide, we do have, in some sense, the right number of lines in our output. That is, we should see 24 exclamation marks in total (4 per line) and we do. That wasn't true of our first run so that suggests there is something else in the file that is throwing off our results. Actually, again I am seeing results that are different to the results that David sees and this time, his output looks better."));
main.append(addParagraph("Depending on the data in your CSV file, you might find that setting the field separator to a comma and RS to \\r will be sufficient to correctly parse the file. This would be the case if your data is made up of simple strings and numbers and doesn't include certain special characters and that would include, for obvious reasons, a comma."));
main.append(addParagraph("For example, the fifth row in the original Excel file is"));
main.append(addSyntax("one,one|two,two|three,three"));
main.append(addParagraph("For clarity, I used the pipe character to separate the fields so you can see that there is a comma inside each field value and this could be confusing when you want to use a comma to separate the fields."));
main.append(addParagraph("The best way to figure out how excel dealt with this is to use vim with the csv file we saved earlier. As a reminder, that was"));
main.append(addImageWithCaption("./images/excel_csv3.jpg", "Our file saved by Excel as a CSV file and viewed with vim."));
main.append(addParagraph("If we process this file, we are getting 16 lines of output, but notice that the fifth line includes quote marks that were not in the original file. This is how Excel handles fields that include a comma when it saves a file in the csv format. It puts the value of the field in double quotes, so one,one becomes \"one,one\" for example."));
main.append(addParagraph("Notice also that this essentially means that the double quote is a special character so if a field includes a double quote, the character is enclosed in double quotes so in the fourth line, for example, \"one\" becomes \"\"\"one\"\"\"."));
main.append(addParagraph("If you look at the sixth line, the fields actually include a newline character and Excel doesn't seem to do anything special with that which is why we see, when we open the file with vim, that the corresponding line is spread over 4 lines but we only see the blue ^M at the end of the fourth line. In addition, as with the comma, the entire field is enclosed in double quotes."));
main.append(addParagraph("Dealing with these additional quote marks is actually pretty straightforward. We just need to make a couple of gsub calls for each line of input. The first call"));
main.append(addSyntax("gsub(\"^\\\"|\\\"$\", \"\", $i);"));
main.append(addParagraph("looks for a quote character at the start of the line or at the end of the line and replaces it with an empty string, effectively removing it. The second call"));
// Renders as gsub("\"\"", "\"", $i); - the pattern string previously lacked its closing quote.
main.append(addSyntax("gsub(\"\\\"\\\"\", \"\\\"\", $i);"));
main.append(addParagraph("looks for a pair of double quotes within the line and replaces it with one double quote character."));
main.append(addParagraph("If we run this again, our results are still not perfect as you can see in the image below."));
main.append(addImageWithCaption("./images/excel_csv4.jpg", "The output we get after tidying up the quote marks in the input file."));
main.append(addParagraph("The field values do look as though they match the input file quite closely, but we still need to deal with commas and newline characters in the field values and we will look at that next."));
main.append(addParagraph("Before we get to that, it is worth noting that we have been looking at the problems associated with a csv file produced by Excel, but there are other tools that can also generate csv files and they may deal with these problems differently.  One approach is to simply put every field value in double quotes so other problems can be ignored since these now become part of a string value.  In this scenario, you can use the same gsub call we used to remove the quotes from the start and end of each cell and then tackle the other problem characters such as commas within a field (or cell) value."));
main.append(addParagraph("The important point to take from that is that there is a general strategy for dealing with this kind of problem, for cleaning up your csv file in other words, so you should bear in mind that you won't always be able to use exactly the same solution.  Hopefully, seeing how the problems are dealt with here should help in developing a strategy in dealing with a file that has different problems, for example, if it uses a different end of line character."));
main.append(addParagraph("You probably began this chapter thinking a csv file was something AWK would be able to handle without too much trouble but in fact, csv files are not always as simple or as consistent as you would like them to be! "));
// Section: parsing Excel CSV files - embedded commas and newlines.
// "\\t" and "\\n" are written with doubled backslashes so the page shows the escape
// sequences themselves instead of a real tab or line break inside the code samples.
main.append(addHeader("Parsing Excel CSV Files: Commas and Newlines"));
main.append(addParagraph("The problem with embedded commas is a little more difficult because there is no easy way for AWK to tell which of the commas in the file are field separators and which are a part of a field.  The best solution is actually to replace the problem with another almost identical problem.  In other words, rather than saving the file as a comma separated file, we can save it as a tab separated file.  Fortunately, Excel makes it quite difficult to embed tab characters within a field so while this solution is not guaranteed to be completely effective, it is much less likely that you will encounter the problem."));
main.append(addParagraph("Once we have the file saved (it has been saved as excel.txt), we can specify the tab rather than the comma as the field separator and otherwise run the program as we had before.  The following image shows this tab-delimited file and the output we get from running our excel.awk program on it."));
main.append(addImageWithCaption("./images/excel_txt.jpg", "The tab delimited file and the output we get when we run excel.awk on it."));
main.append(addParagraph("Before I mention the output, note that there was a slight difference to the way I ran this in comparison to the way David ran it.  For David, the command was"));
main.append(addSyntax("awk -Ft -f excel.awk excel.txt"));
main.append(addParagraph("I got weird results when I ran that so I changed the way in which the field separator was specified so the command became"));
main.append(addSyntax("awk -F'\\t' -f excel.awk excel.txt"));
main.append(addParagraph("which gave me an output that was almost identical to David's although it is still different.  It looks as though the initial quote at the start of the line is not being removed for me."));
main.append(addParagraph("That aside, the output does more or less correctly display line 5 which is the line where the fields contain embedded commas, so that problem has been resolved."));
main.append(addParagraph("The problem of newline characters can be even more difficult to resolve but the best way seems to be as follows.  Remember that the csv file will have this field in double quotes."));
main.append(addParagraph("We will add a while loop to the program before the for loop and it looks like this."));
main.append(addInsetCodeListing(["while ( $NF ~ /^\".*[^\"]$/ ) {", "    getline x;", "    $0 = $0 \"\\n\" x;", "}"]));
// The explanation follows the regex above: the loop keeps running while the last field
// opens with a double quote but does not yet end with one.
main.append(addParagraph("So this will read in the line and check to see if the last field matches the pattern where there is a double quote at the start of the field followed by a string of characters and terminating with something other than a double quote.  If it doesn't match, that suggests that this is the whole line and awk then moves on to the for loop and then the next line.  If it does, this implies that there was a newline character embedded within the double quotes that has caused part of the line to be moved down to the next line.  If that's the case, we call getline which gets the next line in the file and we add a newline character as well as the next line to our current line."));
main.append(addParagraph("Since we have assigned a new value to $0, AWK will reparse the line into fields so the while loop will run again and we will go through the same process until the line starts and ends with a double quote.  At that point, we can be confident that the line with its embedded spaces has been put back together again and matches the corresponding line in the csv file."));
main.append(addParagraph("Of course, depending on why you need to parse a file, you might decide to insert some other character into the line rather than a newline character.  For example, if you are generating HTML, you might want to replace it with a breaking space."));
main.append(addParagraph("As an alternative, there is a public domain csv parser written in AWK and this is included in the Exercise Files in a file called csv.awk.  You can also get a copy from <a href='http://lorance.freeshell.org/csv/'>Lorance.Freeshell.org</a> but using this may involve some work in preparing your data before parsing it.  It is also worth remembering that if you want to parse a csv file in order to clean it up so that you can process it with AWK, you don't necessarily have to use AWK to do that and you might actually prefer to clean it up in Excel before saving it as a csv file.  There is quite a lot you can do with AWK in this regard, but it is unlikely that AWK will ever be able to parse a csv so reliably that it will cover all possible problems you might have with the file but it is nevertheless a useful tool when you are working with csv files."));
// Section: scripting with AWK (the makeofficers.sh bash script).
// "\\t" is written with a doubled backslash so the corrected command is displayed as -F'\t'
// rather than containing a real tab character.
main.append(addHeader("Scripting with AWK"));
main.append(addParagraph("There are a number of reasons why you may want some AWK code to be repeatable.  For example, you might have some data relating to the officers of a bowling club and you need to convert that data to HTML for inclusion in your web site.  You might be able to do that with a single line of AWK but if you want to be able to use the same code every time the data changes, it's probably worth putting into a bash script."));
main.append(addParagraph("AWK is essentially a bash command so there is nothing special about putting it into a script.  You would simply create your script as normal and insert your AWK command as needed.  Let's go ahead and do that and we will call our file makeofficers.sh.  The file looks like this."));
main.append(addImageWithCaption("./images/makeofficers_sh.jpg", "The makeofficers.sh script."));
main.append(addParagraph("This is mostly straightforward and is split into three parts so we will look at each one in turn.  We start with the begin clause and you might think of this (at least in the context of this script) as the things to be done before we start processing the lines in a file.  We are specifying the tab character as our field separator and then printing out a couple of lines.  Note that we don't necessarily have to print these to the screen and in fact, we don't intend to do that.  These are an h1 element that will appear above the table and is essentially a title.  On the next line we print the opening ul tag."));
main.append(addParagraph("The next part of the code is where we process the lines from our file but only if the record (or line) number is greater than 1.  This is because the first line of the file contains the column headers and we don't want to include these in our table.  We then print out four things, the first and last of which are the opening and closing tags for each list item.  Each of these will contain the contact details of 1 officer so we then extract the relevant fields from the line.  These are $1, $2 and $3 and these are (as noted in the comment) the office, first and last names from each record."));
main.append(addParagraph("The next line is perhaps the most complex line of code in the script, but it is actually just a print statement with a mixture of string literals and a field, $12, which appears twice.  This is simply generating an anchor element in HTML and it uses $12 from the record - that is to say, the 12th field in each line - both to set the href attribute and the text to be displayed for the link.  Note that the link is to an email address so in theory, clicking the link should open an email client with an email where the to address has been set to the name of the office holder whose link you clicked."));
main.append(addParagraph("The final section of the code adds the closing ul tag and that ends the AWK.  The last line"));
main.append(addSyntax("}' $1 > officers.html"));
main.append(addParagraph("is interesting.  You might think that the $1 on this line is a reference to field 1 of the file in the same way as we reference that field when we are generating the list items, but that is not the case.  Notice that the AWK is enclosed in single quotes with the opening quote just before BEGIN and the closing quote after the curly brace on this line.  So in this case, $1 is not part of the AWK code but it is, of course, still part of the bash script and it is being used here in a bash script context and represents the first argument passed to the script.  For example, if we run the script, we would use a command like this - note that before we do, we would need to use chmod to make the script executable."));
main.append(addSyntax("./makeofficers.sh officers.txt"));
main.append(addParagraph("We could also replace $1 in the script with officers.txt which would mean we would be able to run the script without having to specify the file you want to process.  Of course, the disadvantage is that you can only process that file with your script unless you edit the script to process a new file."));
main.append(addParagraph("The last part is where we are redirecting the output to a file called officers.html.  This time, the filename is hard-coded into the script but it doesn't need to be.  We could omit this and specify the redirection when we run the command so it is the opposite approach to the one taken with the filename.  Every time the script is successfully executed, it will write the output to officers.html and this will overwrite any pre-existing contents.  If we omit it from the script, it means we have to type it out every time we run the command and that introduces the possibility of making a spelling error or some other type of error."));
main.append(addParagraph("To sum up the last part, you can add both or either of the filenames (input and output) to the script or omit them and add them when running the script.  If you know they will always be the same or perhaps will change very occasionally, adding them to the script is probably a good idea since you only have to type the script once and you can run it later without knowing the name of the input file.  If the script is intended to be more general purpose and will have a variety of both input and output files, it's probably better to leave them out and expect the user running the script to provide them."));
main.append(addParagraph("Although David doesn't explicitly say this in the course video, there is a suggestion that the filename in this scenario may be one that is provided to you and so you may not know in advance what that will be called.  That doesn't necessarily mean that you can't hard-code the filename into the script, but it would mean that you would have to rename the file before processing it  so it may be easier to omit it."));
main.append(addParagraph("One final point, if you look back on the makeofficers.sh script as shown above, the first line of the code, after the shebang line, is:"));
main.append(addSyntax("awk -Ft 'BEGIN {"));
main.append(addParagraph("This didn't work correctly for me and when I ran the script, I got some garbled output which looked like this."));
main.append(addImageWithCaption("./images/makeofficers_sh_broken.jpg", "The result I get when specifying the field separator as Ft."));
main.append(addParagraph("Although this looks like the text has become garbled to some degree, you might notice that the letter t (lower case) doesn't appear anywhere in the output and in most cases, we didn't get the email address in full.  For instance, the office in the first line should be President but is actually displayed as Presiden.  This is because AWK is using t as the field separator.  You can certainly see that it is not using spaces but if you compare this output to the original file, you will see that we are displaying everything up to the first t followed by everything up to the second t and if there are enough fields based on that separator, we also see everything after the 11th t (which is field 12) up to either the next t or the end of the line."));
main.append(addParagraph("To fix this, the line has been changed to"));
main.append(addSyntax("awk -F'\\t' 'BEGIN {"));
main.append(addParagraph("which seems to be the way you need to specify the tab field separator on my Raspberry Pi but once that is done, the HTML does display as expected."));
main.append(addParagraph("Obviously, this course doesn't cover bash scripting, but if you are interested in that, there are several courses on LinkedIn Learning that cover that."));
// Section: the join challenge - introduces the two input files.
main.append(addHeader("Challenge: Perform a Join"));
main.append(addParagraph("For the challenge, we have two data files as follows:"));
main.append(addSyntax("nameemailavg.csv"));
main.append(addParagraph("This file contains the names, email addresses and average score for 19 bowlers.  As you can see, this is a comma-separated file."));
// Filename corrected from "addresses.txr" to match the addresses.txt file named in the solution.
main.append(addSyntax("addresses.txt"));
main.append(addParagraph("This is a file containing, amongst other things, names and addresses for a large number of people and this includes the 19 bowlers from the first file."));
main.append(addParagraph("The aim of the challenge is to use these two files to generate a new file containing the name, address and bowling average as an integer for each of those 19 bowlers.  The solution and the output from running it is shown below."));
main.append(addHeader("Solution: Perform a Join"))
main.append(addParagraph("This is David's solution."))
main.append(addImageWithCaption("./images/challenge.jpg", "The challenge solution and output."))
main.append(addParagraph("As before, we will go through this section by section starting with BEGIN where we set both the field and output field separators to tab."))
main.append(addParagraph("Since we will be processing two files, we need to have some code that executes for one file and some for the other.  The second section handles processing for the nameemailavg.csv file.  The first thing to note is that the first line looks a little bit like an assignment but it is using the comparison operator (==).  So this is a condition and it will process all the lines in a file provided the name of the file is nameemailavg.csv."))
// Explain why the lines of the CSV file are broken up with split() instead of
// awk's own field splitting, and describe split()'s three arguments.
main.append(addParagraph("We can't simply extract fields in the usual way because we have different field separators for the two files and we chose to use tabs as the field separator.  Essentially, since there are no tabs in this file, it means that each line of the file is a single field.  We will split each line from the file with the command"));
main.append(addSyntax("split($0, a, \",\");"));
main.append(addParagraph("This uses a comma, but not really as a field separator.  Rather, it splits one field into several, using the comma to identify the breaks.  Objectively, this isn't any different from a field separator but it allows us to essentially get the different fields even where the specified field separator is something else and it creates an array to hold these fields.  The syntax of the command is fairly straightforward and it takes three arguments, the field, the name of the array to be created and the separator."));
main.append(addParagraph("One interesting point about that is we have specified $0 as the field because we want to split the whole line into different fields.  If you have a file where there is a field that contains several items of data separated by something other than your field separator, you could also use split on that specific field."));
// Walk through the lines that record the email address and build the
// email-indexed name and average arrays, using the first record of
// nameemailavg.csv as a worked example.
// Each entry is [factory, text]; nodes are created and appended one at a
// time, in the order listed.
for (const [build, text] of [
    [addParagraph, "The next line creates a variable called email and sets it to the value in a[2] which is the second element in the array we just created and as you might guess, it is the email address from the current line."],
    [addParagraph, "The next line adds that email address to an array using the file record number (FNR) as an index."],
    [addParagraph, "The next two lines do something similar but they create an associative array with email as the index.  This gives us an array called name where each name in the file is associated with the email address and a similar associative array for average.  To make this clearer, consider the first line of the file which is"],
    [addSyntax, "Art Venere,art@venere.org,256.62394383"],
    [addParagraph, "This creates an array, a, that looks like this."],
    [addSyntax, "[\"Art Venere\", \"art@venere.org\", \"256.62394383\"]"],
    [addParagraph, "The value of email is set to art@venere.org.  In the name array, this also gives us an element whose index is the same email address and whose value is the name so in this example, Art Venere.  Similarly, in the average, there is an element whose index is art@venere.org and the associated value is 256.62394383."],
]) {
    main.append(build(text));
}
// Describe how the addresses.txt section builds an email-indexed address from
// fields 4, 5, 7 and 8, with a sample input line and the address it produces.
main.append(addParagraph("In the next section, we are processing the addresses.txt file and we start by getting the value of field $11 which is the email address.  Quick sidenote in case you hadn't noticed, the email address is the only field that is common to both files which is why we are using it for indexing purposes.  As before, we will use that email address to create an associative array using the email address as the index.  In this case, however, the value isn't coming from a single field.  It is composed of several fields (4, 5, 7 and 8) and there are commas separating fields 4 and 5 and fields 5 and 7.  Again, let's look at a line from the input file, in this case that is addresses.txt."));
main.append(addSyntax("Art     Venere  Chemel, James L Cpa     8 W Cerritos Ave #54    Bridgeport      Gloucester      NJ      08014   856-636-8749    856-264-4130    art@venere.org  http://www.chemeljameslcpa.com"));
main.append(addParagraph("Not counting the line with the headers, this is the second line of the file but I chose it because it is the address for the person we looked at in our previous example, Art Venere.  Notice that the fields holding the different parts of Art's address are fields 4 to 8 so there are five fields.  These are address (the street address), city, county, state and zip.  You might notice that we are creating the address by putting 4 of these 5 fields together and we are leaving out the county so in this example, the address will be"));
main.append(addSyntax(" 8 W Cerritos Ave #54, Bridgeport, NJ 08014"));
// Cover the END section (one output line per email from nameemailavg.csv) and
// discuss where the average should be converted to an integer.
main.append(addParagraph("In the END section, we are iterating over the array of email addresses and remember, this is made up of the 19 email addresses we got from nameemailavg.csv.  For each email, we write the corresponding value from the associative arrays and since these are using the email address as an index, this means that we are getting the name, address and average score associated with that email address and these become a line of output."));
main.append(addParagraph("A couple of things to note here, the address array contains all of the addresses from the addresses.txt file but we only get 19 lines of output because we are using the emails array to fetch those addresses so we are only getting the addresses for the nineteen people listed in nameemailavg.csv.  The name and average arrays were generated with the email addresses from that file so these both have 19 values with the 19 email addresses used to index them."));
main.append(addParagraph("In the average array, we have the precise average taken from nameemailavg.csv and we don't convert it to an integer until we generate each line of output in the END section and we do that by passing the value to the int function before adding it to the output line.  This is just a personal preference, but I would do the conversion when processing the nameemailavg.csv file so that the line"));
main.append(addSyntax("average[email] = a[3];"));
main.append(addParagraph("becomes"));
main.append(addSyntax(" average[email] = int(a[3]);"));
main.append(addParagraph("This is possibly more efficient because the array stores integers rather than the full floating point number.  That's probably not going to make too much difference but I think that it's better to get that out of the way when we first encounter it, especially because this is on a line of its own whereas in the END section, we are creating the output line with a single line of code and it looks better, to me, if we don't do the conversion there because that adds a weird looking inconsistency to the code.  On the other hand, it does make it a little clearer because we are doing that conversion for output purposes and if we are doing something else with these arrays, for example if we wanted to calculate an average, it might be better to have more precise values in the array.  So this is a personal preference but in terms of which you use, you also need to take into account exactly what you will be doing with the data."));
// Show how the challenge script is invoked and explain why the order of the
// two input files on the command line does not change the result.
main.append(addParagraph("As you may have noticed at the start of this section, we run the challenge script with a command like"));
main.append(addSyntax("awk -f challenge.awk nameemailavg.csv addresses.txt"));
main.append(addParagraph("In the script, we process the nameemailavg.csv file first and then the addresses.txt file, so it makes sense to specify them in that order when executing the command, but in fact the order really doesn't matter.  AWK reads the files in whatever order they appear on the command line, but each section of the script only runs for lines from its own file and we are not really interested in matching up the corresponding fields until we execute the code in the END section, so we just need to pass both files to the script."));

// Finally, invoke the sidebar helper with the "linux" key.
addSidebar("linux");