import { addBanner, addArticle, addTitle, addHeader, addParagraph, addSubHeader } from '/scripts/article.js'; import { addInset, addInsetList, addInsetCodeListing, addInsetBulletList } from '/scripts/inset.js'; import { addImageWithCaption, addButtonGroup } from '/scripts/visuals.js'; import { addSidebar} from '/scripts/sidebar.js'; import { addSyntax } from '/scripts/code.js'; import { menu } from '/scripts/web_dev_buttons.js'; import { global_menu } from '/scripts/grid_layout1.js'; import { local_menu } from '/scripts/linux.js'; const heading = document.querySelector(".heading"); const global = document.querySelector(".global_menu"); const local = document.querySelector(".local_menu"); const sidebar = document.querySelector(".sidebar"); const main = document.querySelector(".main_content"); heading.append(addTitle("AWK Essential Training")); heading.append(addParagraph("David D. Levine - LinkedIn Learning - November 2022")); heading.append(addParagraph("Chapter 8 - Combining AWK with Other Tools")); main.append(addHeader("COMBINING AWK WITH OTHER TOOLS USING PIPES")) main.append(addParagraph("In the section on Functions and Arrays, we saw an example where the output of an awk command is piped to the sort command so that we can sort the output of the wordusage.awk program. A pipe is also commonly used to pipe the output of a command that produces too much output to read in the screen to a program that will paginate the output such as less or more.")) main.append(addParagraph("We can also pipe the output from other commands into awk. As an example, a command like ls -l (or ll which is a common alias for ls - l) produces lines of output in a format that awk can interpret. Consider the following example.")) main.append(addSyntax("ls -l | awk '/\.txt$)/{total+=$5; print} END {print total}'")) main.append(addParagraph("This takes the output of ls - 1 and pipes into to an AWK command. The command itself isn't too important in this context, but it would be good practice to break this down.")) main.append(addParagraph("Firstly , we have a pattern. This will act as a filter which means that we will ignore any line coming from the list command if it doesn't match the pattern. The pattern is fairly simple, this is just a string literal, .txt. Note that the dot is escaped because we only want to match with .txt, If we omitted the backslash ( so the hot isn't being escaped ) it would match with any character so rather than only .txt, it would also match with atxt, ?txt, 3txt and so on. Of course, it would also match with .txt, the important point is that the results would almost always be the same either way but escaping it means that any line that matches it will definitely include the string .txt. Not escaping it means that the line definitely includes the string txt and that there is at least one character before it.")) main.append(addParagraph("The $ anchors the string at the end of the line. In this context, the letters txt would normally appear as a file extension so the dot will probably be there and it will likely be at the end of the line. The point here is that you could probably be a little bit sloppy with the pattern and find that it gives you an accurate result most of the, but will sometimes throw up an inaccurate result so it is important to know exactly what you are trying to match with.")) main.append(addParagraph("The rest of the awk code takes the value of the fifth field, which its the file size and adds it to the total. When all the lines have been parsed, that total is then printed out so this code takes all of the files in a directory (it will also print the line if it matches) and gives us a total size for all the files.")) main.append(addParagraph("The result of running this in a directory containing the Exercise files for this course is shown below.")) main.append(addImageWithCaption("./images/pipe1.jpg", "Piping the output from a directory listing to awk for processing .")) main.append(addParagraph("Another tool commonly used with AWK is SED. SED makes it easier for us to change the text we are parsing and actually is complex enough to have a course dedicated to it which you can find at Lorance.Freeshell.org but using this may involve some work in preparing your data before parsing it. It is also worth remembering that if you want to parse a csv file in order to clean it up so that you can process it with AWK, you don't necessarily have to use AWK to do that and you might actually prefer to clean it up in Excel before saving it as a csv file. There is quite a lot you can do with AWK in this regard, but it is unlikely that AWK will ever be able to parse a csv so reliably that it will cover all possible problems you might have with the file but it is nevertheless a useful tool when you are working with csv files.")) main.append(addHeader("Scripting with AWK")) main.append(addParagraph("There are a number of reasons why you may want some AWK code to be repeatable. For example, you might have some data relating to the officers of a bowling club and you need to convert that data to HTML for inclusion in your web site. You might be able to do that with a single line of AWK but if you want to be able to use the same code every time the data changes, it's probably worth putting into a bash script.")) main.append(addParagraph("AWK is essentially a bash command so there is nothing special about putting it into a script. You would simply create your script as normal and insert your AWK command as needed. Let's go ahead and do that and we will call our file makeeofficers.sh. The file looks like this.")) main.append(addImageWithCaption("./images/makeofficers_sh.jpg", "The makeofficers.sh script.")) main.append(addParagraph("This is mostly straightforward and is split into three parts so we will look at each one in turn. We start with the begin clause and you might think of this (at least in the context of this script) as the things to be done before we start processing the lines in a file. We are specifying the tab character as our field separator and then printing out a couple of lines. Note that we don't necessarily have to print these to the screens and in fact, we don't intend to do that. These are an h1 element that will appear above the table and is essentially a title. On the next line we print the opening ul tag.")) main.append(addParagraph("The next part of the code is where we process the lines from our file but only if the record (or line) number is greater than 1. This is because the first line of the file contains the column headers and we don't want to include these in our table. We then print out four things, the first and last of which are the opening and closing tags for each list item. Each of these will contain the contact details of 1 officer so we then extract the relevant fields from the line. These are $1, $2 and $3 and these are (as noted in the comment) the office, first and last names from each record.)")) main.append(addParagraph("The next line is perhaps the most complex line of code in the script, but it is actually just a print statement with a mixture of string literals and a field, $12, which appears twice. This is simply generating an anchor element in HTML and it uses $12 from the record - that is to say, the 12th field in each line - both to set the href attribute and the text to be displayed for the link. Note that the link is to an email address so in theory, clicking the link should open an email client with an email where the to address has been set to the name of the office holder whose link you clicked.")) main.append(addParagraph("The final section of the code adds the closing ul tag and that ends the AWK. The last line")) main.append(addSyntax("}' $1 > officers.html")) main.append(addParagraph("is interesting. You might think that the $1 on this line is a reference to field 1 of the file in the same was as we reference that field when we are generating the list items, but that is not the case. Notice that the AWK is enclosed in single quotes with the opening quote just before BEGIN and the closing quote after the curly brace on this line. So in this case, $1 is not part of the AWK code but it is, of course, still part of the bash script and it is being used here in a bash script context and represents the first argument passed to the script. For example, if we run the script, we would use a command like this - note that before we do, we would need to use chmod to make the script executable.")) main.append(addSyntax("./makeofficers.sh officers.txt")) main.append(addParagraph("We could also replace $1 in the script with officers.txt which would mean we would be able to run the script without having to specify the file you want to process. Of course, the disadvantage is that you can only process that file with your script unless you edit the script to process a new file.")) main.append(addParagraph("The last part is where we are redirecting the output to a file called officers.html. This time, the filename is hard-coded into the script but it doesn't need to be. We could omit this and specify the redirection when we run the command so it is the opposite approach to the one taken with the filename. Every time the script is successfully executed, it will write the output to officers.html and this will overwrite any pre-existing contents. If we omit it from the scrip[t, it means we have to type it out every tine we run the command and that introduces the possibility of making a spelling error or some other type of error.")) main.append(addParagraph("To sum up the last part, you can add both or either of the filenames (input and output) to the script or omit them and add them when running the script. If you know they will always be the same or perhaps will change very occasionally, adding them to the script is probably a good idea since you only have to type the script once and you can run it later without knowing the name of the input file. If the script is intended to be more general purpose and will have a variety of both input and output files, it's probably better to leave them out and expect the user running the script to provide them.")) main.append(addParagraph("Although David doesn't explicitly say this in the course video, there is a suggestion that the filename in this scenario maybe one that is provided to you and so you may not know in advance what that will be called. That doesn't necessarily mean that you can't hard-code the filename into the script, but it would mean that you would have to rename the file before processing it so it may be easier to omit it.")) main.append(addParagraph("One final point, if you look back on the makeoffices.sh script as shown above, the first line of the code, after the shebang line, is:")) main.append(addSyntax("awk -Ft 'BEGIN {")) main.append(addParagraph("This didn't work correctly for me and when I ran the script, I got some garbled output which looked like this.")) main.append(addImageWithCaption("./images/makeofficers_sh_broken.jpg", "The result I get when specifying the field separator as Ft.")) main.append(addParagraph("Although this looks like the text has become garbled to some degree. you might notice that the letter t (lower case) doesn't appear anywhere in the output and in most cases, we didn't get the email address in full. For instance, the office in the first line should be President but is actually displayed as Presiden. This is because AWK is using t as the field separator. You can certainly see that it is not using spaces but if you compare this output to the original file, you will see that we are displaying everything up to the first t followed by everything up to the second t and if there are enough fields based on that separator, we also see everything after the 11th t (which is field 12) up to either the next t or the end of the line.")) main.append(addParagraph("To fix this, the line has been changed to")) main.append(addSyntax("awk -F'\t' 'BEGIN {")) main.append(addParagraph("which seems to be the way you need to specify the tab field separator on my Raspberry Pi but once that is done, the HTML does display as expected.")) main.append(addParagraph("Obviously, this course doesn't cover bash scripting, but if you are interested in that, there are several courses on LinkedIn Learning that cover that.")) main.append(addHeader("Challenge: Perform a Join")) main.append(addParagraph("For the challenge, we have two data files as follows:")) main.append(addSyntax("nameemailavg.csv")) main.append(addParagraph("This file contains the names, email addresses and average score for 19 bowlers. As you can see, this is a comma-separated file.")) main.append(addSyntax("addresses.txr")) main.append(addParagraph("This is a file containing, amongst other things, names and addresses for a large number of people and this includes the 19 bowlers from the first file.")) main.append(addParagraph("The aim of the challenge is to use these two files to generate a new file containing the name, address and bowling average as an integer for each of those 19 bowlers. The solution and the output from running it is shown below.")) main.append(addHeader("Solution: Perform a Join")) main.append(addParagraph("This is David's solution.")) main.append(addImageWithCaption("./images/challenge.jpg", "The challenge solution and output.")) main.append(addParagraph("As before, we will go through this section by section starting with BEGIN where we set both the field and output field separators to tab.")) main.append(addParagraph("Since we will be processing two files, we need to have some code that executes for one file and some for the other. The second section handles processing for the nameemailavg.csv file. The first thing to note is that the first line looks a little bit like an assignment but it is using the comparison operator (==). So this is a condition and it will process all the lines in a file provided the name of the file is nameemailavg.csv.")) main.append(addParagraph("We can't simply extract fields in the usual way because we have different field separators for the two files and we chose to use tabs as the filed separator. Essentially, since there are no tabs in this file, it means that each line of the file is a single field. We will split each line from the file with the command")) main.append(addSyntax("split($0, a, \",\");")) main.append(addParagraph("This uses its a comma but it's not really as a field separator. Rather, it uses the comma to split one field into several using a comma to identify the breaks. Objectively, this isn't any different from a field separator but it allows us to essentially get the different fields even where the specified field separator is something else and it creates an array to hold these fields. The syntax of the command is fairly straightforward and it takes three arguments, the field, the name of the array to be created and the separator.")) main.append(addParagraph("One interesting point about that is we have specified $0 as the field because we want to split the whole line into different fields. If you have a file where there is a field that contains several items of data separated by something other than your field separator, you could also use split on that specific field.")) main.append(addParagraph("The next line creates a variable called email and sets it to the value in a[2] which is the second element in the array we just created and as you might guess, it is the email address from the current line.")) main.append(addParagraph("The next line adds that email address to an array using the file record number (FNR) as an index.")) main.append(addParagraph("The next two lines do something similar but they create an associative array with email as the index. This gives us an array called name where each name in the file is associated with the email address and a similar associative array for average. To make this clearer, consider the first line of the file which is")) main.append(addSyntax("Art Venere,art@venere.org,256.62394383")) main.append(addParagraph("This creates an array, a, that looks like this.")) main.append(addSyntax("[\"Art Venere\", \"art@venere.org\", \"256.62394383\"]")) main.append(addParagraph("The value of email is set to art@venere.org. In the name array, this also gives us an element whose index is the same email address and whose value is the name so in this example, Art Venere. Similarly, in the average, there is an element whose index is art@venere.org and the associated value is 256.62394383.")) main.append(addParagraph("In the next section, we are processing the addresses.txt file and we start by getting the value of field $11 which is the email address. Quick sidenote in case you hadn't noticed, the email address is the only field that is common to both files which is why we are using it for indexing purposes. As before, we will use that email address to create an associative array using the email address as the index. In this case, however, the value isn't coming from a single field. It is composed of several fields (4, 5, 7 and 8) and there are commas separating fields 4 and 5 and fields 5 and 6. Again, let's look at a line from the input file, in this case that is addresses.txt.")) main.append(addSyntax("Art Venere Chemel, James L Cpa 8 W Cerritos Ave #54 Bridgeport Gloucester NJ 08014 856-636-8749 856-264-4130 art@venere.org http://www.chemeljameslcpa.com")) main.append(addParagraph("Not counting the line with the headers, this is the second line of the file but I chose it because it is the address for the person we looked at in our previous example, Art Venere. Notice that the fields holding the different parts of Art's address are fields 4 to 8 so there are five fields. These are address (the street address), city, county, state and zip. You might notice that we are creating the address by putting 4 of these 5 fields together and we are leaving out the county so in this example, the address will be")) main.append(addSyntax(" 8 W Cerritos Ave #54, Bridgeport, NJ 08014")) main.append(addParagraph("In the END section, we are iterating over the array of email addresses and remember, this is made up of the 19 email addresses we got from nameemailavg.csv. For each email, we wrote the corresponding value from the associative arrays and since these are using the email address as an index, this means that we are getting the name, address and average score associate with that email address and these become a line of output.")) main.append(addParagraph("A couple of things to note here, the address array contains all of the addresses from the addresses.txt file but we only get 19 lines of output because we are using the emails array to fetch those addresses so we are only getting the addresses for the nineteen people listed in nameemailavg.csv. The name and average arrays were generated with that email addresses from that file so these both have 19 values with the 19 email addresses used to index them.")) main.append(addParagraph("In the average array, we have the precise average taken from nameemailavg.csv and we don't convert it to an integer until we generate each line of output in the end section and we do that by passing the value to the int function before adding it to the output line. This is just a personal preference, but I would do the conversion when processing the nameemailavg.csv file so that the ")) main.append(addSyntax("average[email] = a[3];")) main.append(addParagraph("becomes")) main.append(addSyntax(" average[email] = int(a[3]);")) main.append(addParagraph("This is possibly more efficient because the array stores integers rather than the full floating point number. That's probably not going to make too much difference but I think that it's better to get that out of the way when we first encounter it, especially because this is on a line of its own whereas in the END section, we are creating the output line with a single line of code and it looks better, to me, if we don't do the conversion there because that adds a weird looking inconsistency to the code. On the other hand, it does make it a little clearer because we are doing that conversion for output purposes and if we are doing something else with these arrays, for example if we wanted to calculate an average, it might be better to have more precise values in the array. So this is a personal preference but in terms of which you use, you also need to take into account exactly what you will be doing with the data.")) main.append(addParagraph("As you may have noticed at the start of this section, we run the challenge script with a command like")) main.append(addSyntax("awk -f challenge.awk nameemailavg.csv addresses.txt")) main.append(addParagraph("In the script, we process the nameemailavg.csv file first and then the addresses.txt file, so it makes sense to specifiy them in that order when executing the command, but in fact the order really doesn't matter. We just need to pass both files to the script and it will still process them in the same order (although, even if it didn't, that wouldn't matter because we are not really interested in matching up the corresponding fields until we execute the code in the END section).")) addSidebar("linux");