/**
 * Page script for "AWK Essential Training", Chapter 7 - Functions and Arrays.
 *
 * Populates the `.heading` and `.main_content` elements with the article
 * content using the builder helpers imported below, then calls
 * addSidebar("linux").
 *
 * Every string literal is kept on a single physical line: a raw line break
 * inside a quoted literal is a syntax error in JavaScript.
 */
import { addBanner, addArticle, addTitle, addHeader, addParagraph, addSubHeader } from '/scripts/article.js';
import { addInset, addInsetList, addInsetCodeListing, addInsetBulletList } from '/scripts/inset.js';
import { addImageWithCaption, addButtonGroup } from '/scripts/visuals.js';
import { addSidebar} from '/scripts/sidebar.js';
import { addSyntax } from '/scripts/code.js';
import { menu } from '/scripts/web_dev_buttons.js';
import { global_menu } from '/scripts/grid_layout1.js';
import { local_menu } from '/scripts/linux.js';

// Layout regions of the page. Only `heading` and `main` are written to in
// this script; the other lookups are retained unchanged.
const heading = document.querySelector(".heading");
const global = document.querySelector(".global_menu");
const local = document.querySelector(".local_menu");
const sidebar = document.querySelector(".sidebar");
const main = document.querySelector(".main_content");

// ---------------------------------------------------------------------------
// Page heading
// ---------------------------------------------------------------------------
heading.append(addTitle("AWK Essential Training"));
heading.append(addParagraph("David D. Levine - LinkedIn Learning - November 2022"));
heading.append(addParagraph("Chapter 7 - Functions and Arrays"));

// ---------------------------------------------------------------------------
// Section: string functions
// ---------------------------------------------------------------------------
main.append(addHeader("MANIPULATING A STRING"));
main.append(addParagraph("AWK provides a number of methods for string manipulation. Before we get into those, it is important to remember that in AWK, string positions start at 1 rather than 0. In general, if you refer to the character in position 0 of a string, you are referring to that whole string."));
main.append(addParagraph("Functions that you can use with strings include the following."));
main.append(addSubHeader("length ([string]) "));
main.append(addParagraph("This returns the length of a string."));
main.append(addSubHeader("length() "));
main.append(addParagraph("This returns the length of the whole line."));
main.append(addSubHeader("index(string,target) "));
main.append(addParagraph("This returns the position of the first instance of the specified string or 0 if it is not found."));
main.append(addSubHeader("match(string,regexp) "));
main.append(addParagraph("This is the same as index but looks for a regex rather than a string."));
main.append(addSubHeader("substr(string,start[,length])"));
main.append(addParagraph("This will return the string that starts at the position given with start and has the specified length. If no length is specified, it will return everything up to the end of the string."));
main.append(addParagraph("Note that with the match() function, as well as returning the position of the matched string, it also sets the values for a couple of AWK variables. These are"));
main.append(addSubHeader("RSTART"));
main.append(addParagraph("The starting position of the matched string"));
main.append(addSubHeader("RLENGTH"));
main.append(addParagraph("The length of the matched string"));
main.append(addParagraph("For example "));
main.append(addSyntax("match(\"antidisestablishmentarianism\",/b[a-z]*n/)"));
main.append(addParagraph("will return 12, the position at which a matching string was found. It also sets the values for RSTART and RLENGTH as follows"));
main.append(addSyntax("RSTART = 12"));
main.append(addParagraph("and"));
main.append(addSyntax("RLENGTH = 14"));
main.append(addParagraph("Remember that the regex is greedy so it will match with the longest string possible. The file, removethe.awk demonstrates some of these functions."));
main.append(addImageWithCaption("./images/removethe.jpg", "The removethe.awk program."));
main.append(addParagraph("This is using three of the functions together in order to locate and remove the string, the, from each line it is found in."));
main.append(addParagraph("It starts by creating a variable to hold the string we are searching for. It then uses index to locate the string in the line."));
main.append(addParagraph("The next line sets the value of s to the position at which that string is found."));
main.append(addParagraph("We then have some code inside a conditional, the condition being that s==0. Since s represents the index value of the first character in the string if 'the' is found in the line, if it is equal to zero, that means that the string wasn't found."));
main.append(addParagraph("If the condition is true, it just outputs the whole line. If it is false (meaning that the string was found), two substrings of the line are output."));
main.append(addSyntax("substr($0, 1,s-1 )"));
main.append(addParagraph("Remember that s is the index position of the string in the line so s-1 is the index position of the character immediately before it so this will print everything in the line before 'the'."));
main.append(addParagraph("The second substring is"));
main.append(addSyntax("substr($0,s+length(target))"));
main.append(addParagraph("This outputs a string starting at a position given by the expression (s+length(target)) which is the position of the first occurrence of the string 'the'+3 (the length of 'the') which means that this substring is everything in the line after the first instance of 'the'."));
main.append(addParagraph("So essentially we are outputting everything before the first instance of 'the' and everything after it which effectively removes it from the line! The result of running the program using dukeofyork.txt is shown below."));
main.append(addImageWithCaption("./images/removethe_output.jpg", "The result we get from running removethe.awk against dukeofyork.txt."));
main.append(addParagraph("Notice that we are not looking for the word the, we are looking for the string the and you will see this, for example, in the third line where the word 'them' becomes just 'm'. You will also see, again in the third line where the word 'the' appears after 'them' but has not been removed because we are only looking for and removing the first instance of 'the', whether as a whole word or a part of a word."));
main.append(addParagraph("So that's how we can use AWK to remove part of a line but there are also other functions that are better for this."));
main.append(addSubHeader("sub(regexp,newval[,string])"));
main.append(addParagraph("The sub function takes the optional string value if it is provided, it looks for a match with the regex and if found, it replaces it with newval."));
main.append(addParagraph("So that gives us a simple way to remove the string 'the' from each line of our file."));
main.append(addParagraph("Note that the sub function can modify a string so you can't use it with a string literal or an expression that returns a string literal. You can use a field, a variable or an array element, or you can omit it, in which case $0 is used."));
main.append(addSyntax("$ awk '{sub(/the/,\"\");print}' dukeofyork.txt"));
main.append(addParagraph("Here is another example that looks for either 'they' or 'them' in a string and replaces it with 'ALL OF THEM'. The result is shown below."));
main.append(addSyntax("$ awk '{sub(/the[ym]/, \"ALL OF THEM\"); print}' dukeofyork.txt"));
main.append(addImageWithCaption("./images/removethe_output2.jpg", "Demonstrating the sub function."));
main.append(addSubHeader("gsub(regexp,newval[,string])"));
main.append(addParagraph("The gsub function does exactly the same as the sub function, but operates globally, hence the g. In this context, that means it will replace all instances of the pattern it matches rather than just the first. If we re-run the previous command, replacing sub with gsub, we get the output shown below."));
// NOTE(review): this reuses the image path of the sub() figure above; it
// presumably should point at a separate gsub screenshot - TODO confirm the file name.
main.append(addImageWithCaption("./images/removethe_output2.jpg", "Demonstrating the gsub function."));
main.append(addSubHeader("split (string,array[,regexp])"));
main.append(addParagraph("The split function will take a string and much like awk itself does, will split the string into fields and sub-fields. It then places those fields or sub-fields into an array. It uses regexp as the separator."));
main.append(addParagraph("To demonstrate this, we will use the file nameemailarg.csv and to quickly recap, each line in the file contains the name of a bowler in the format first_name last_name, the bowler's email address and average score. So that is three fields separated by commas."));
main.append(addParagraph("As we have already seen, it is easy enough to access the individual fields, but sometimes, we might also want to access parts of a field. For example, we might want to update the file to store the names in the format last_name first_name. We can do that with the command"));
// The backslash is doubled so the page shows the two characters \t of the awk
// program rather than a literal TAB produced by the JavaScript escape.
main.append(addSyntax("awk -F, 'BEGIN{OFS=\"\\t\"} {split($1, a, / /); print a[2],\",\" a[1], $2, $3}' nameemailarg.csv"));
main.append(addParagraph("The code specifies a comma as the field separator for the input and a tab as the output field separator."));
main.append(addParagraph("The split command takes the first field, which in our example is the bowler's name, specifies a as the array to store the results in and space as the separator which will result in the first name being stored in a[1] and the last name being stored in a[2]."));
main.append(addParagraph("The print statement outputs the array elements in the order a[2] followed by a[1] with a comma between them and then the second and third fields with the fields being tab-separated."));
main.append(addParagraph("The result of running this is shown below."));
main.append(addImageWithCaption("./images/split.jpg", "Demonstrating the split function."));

// ---------------------------------------------------------------------------
// Section: associative arrays
// ---------------------------------------------------------------------------
main.append(addHeader("USING ASSOCIATIVE ARRAYS"));
main.append(addParagraph("As we have seen, arrays in AWK are referenced by number starting with 1. However, AWK also provides associative arrays so you can create elements that use a string as the index value. For example, if we have an array, a, and we want to declare the first element with the reference \"first\", we would do it like this."));
main.append(addSyntax("a[\"first\"]=$1"));
main.append(addParagraph("This takes the first field and sets the value a[\"first\"] with it."));
main.append(addParagraph("If we want to access the element, we can do it like this."));
main.append(addSyntax("a[\"first\"]"));
main.append(addParagraph("Let's see that in an example where we create an array with the first three fields and then output them in reverse order."));
main.append(addSyntax("awk '{a[\"first\"]=$1; a[\"second\"]=$2; a[\"third\"]=$3; print a[\"third\"], a[\"second\"], a[\"first\"]}'"));
main.append(addParagraph("If we then provide the input"));
main.append(addSyntax("one two three"));
main.append(addParagraph("we get the output"));
main.append(addSyntax("three two one"));
main.append(addParagraph("You can use a for-loop with an associative array but not using an integer counter as we have previously. Instead, we use a for-in loop which will return each index value in an associative array which you can then use to access the associated value."));
main.append(addParagraph("The image below shows the same command we used to set up the associative array, a, but this time we use a for-in loop to output the index values along with their associated values."));
main.append(addImageWithCaption("./images/associative_array1.jpg", "Using a for-in loop to output the contents of an associative array."));
main.append(addParagraph("We used the same input but notice that the output is in a different order to what you might expect. We didn't specify any order in fact and associative arrays, when you iterate through them with a for-in loop, don't guarantee the order in which the elements will be returned."));
main.append(addParagraph("To demonstrate this, I also ran the same code on my dev machine and as you can see in the image below, the order is different."));
main.append(addImageWithCaption("./images/associative_array2.jpg", "Running the same for-in loop on my dev machine."));
main.append(addParagraph("Let's look at a more complex and potentially more useful example that takes some input and returns the number of instances of each word using an associative array."));
// NOTE(review): the sentence below ends with "The code for this is" but no
// addSyntax listing follows it - the listing appears to be missing; TODO restore.
main.append(addParagraph("It uses two loops, the first goes through each word in the input file and uses it as the index value in the associative array called words. To make it effectively case-insensitive, the word is converted to all lower-case. The code for this is"));
main.append(addParagraph("For instance, if we use dukeofyork.txt as the input, the first value is 'The' and the tolower function converts this to 'the'. Then the value of the array element with the index value 'the' is incremented. If it doesn't already exist it is created with an initial value of 0, so this is set to 1 when it is incremented."));
main.append(addParagraph("The second loop outputs each index/value pair. The image below shows the result of running this code with dukeofyork.txt as the input."));
main.append(addImageWithCaption("./images/associative_array3.jpg", "The result of running our word count program against dukeofyork.txt."));
main.append(addParagraph("As you can see the output is pretty random, but we can pipe it to sort which will output the words in alphabetical order. If we run it as follows"));
main.append(addSyntax("awk -f wordusage.awk dukeofyork.txt | sort -rn -k 2"));
main.append(addParagraph("this sorts the output in reverse numeric order and the -k 2 option indicates that this should be sorted based on the value in the second field of the output."));
main.append(addParagraph("For our final example, you might recall that awk only supports one dimensional arrays. However, it is possible to emulate multi-dimensional arrays."));
main.append(addParagraph("For instance, in the following example, we use a string to record one or more dimensions. Actually, this isn't really anything new. Most of the examples we have seen so far use data in a two-dimensional array. The file, dukeofyork.txt contains 8 lines, each of which contains a number of words. If we read in each line and store it in an array, each element of the array would hold several records effectively giving us a 2-dimensional array."));
main.append(addParagraph("The following image shows both the program, transpose.awk and the result of running this using dukeofyork.txt as the input file."));
main.append(addImageWithCaption("./images/transpose.jpg", "The transpose.awk program and the result of running it with dukeofyork.txt."));
main.append(addParagraph("This is putting the fields for each record into an array and then outputting it printing field 1 for each record and then field 2 for each record on the next line and so on until each line from the input has been displayed in a column in the output."));
main.append(addParagraph("This is essentially a hack to make it appear as though AWK supports 2-dimensional arrays but there are other ways to do that. For example, we could assume each line has 10 fields which means that we can accommodate any line from dukeofyork.txt. We could then use a for loop that runs 10 times for each line (so for this file, the loop would be inside a loop that executes 8 times)."));
main.append(addParagraph("Essentially, this allows us to go through each line and put it into 10 consecutive elements of the array so each block of 10 elements contains a line, the first 10 elements contain the first line, the next 10 contain the second line and so on. You can then use integer division and the modulo operator on any element of the array to work out which field of which line any given element of the array represents."));

// ---------------------------------------------------------------------------
// Section: math functions
// ---------------------------------------------------------------------------
main.append(addHeader("INTRODUCING AWK's MATH FUNCTIONS"));
main.append(addParagraph("As we have already seen, AWK is primarily intended to be used as a text-manipulation tool, but it does also provide some mathematical functions. "));
main.append(addSubHeader("int ( X )"));
main.append(addParagraph("This returns the int value of x."));
main.append(addSubHeader("rand()"));
main.append(addParagraph("This returns a random value between 0 and 1 (it can be 0 but is always less than 1). You can use it to generate a random integer between 1 and n by multiplying the random value by n, taking the int value of the result and adding 1."));
main.append(addParagraph("For example, you could simulate the roll of a 6-sided die with"));
main.append(addSyntax("int (rand()*6)+1"));
main.append(addSubHeader("srand(x)"));
main.append(addParagraph("This seeds the rand function using x or if you omit it, using a value derived from the current date and time."));
main.append(addSubHeader("sqrt(x)"));
main.append(addParagraph("This returns the square root of x."));
main.append(addSubHeader("sin(x)"));
main.append(addParagraph("This returns the sine value of x in radians."));
main.append(addSubHeader("cos(x)"));
main.append(addParagraph("This returns the cosine value of x in radians"));
main.append(addSubHeader("atan2(y,x)"));
main.append(addParagraph("This returns the arctangent of y over x in radians."));
main.append(addParagraph("For example this will give you the value of pi."));
main.append(addSyntax("atan2(0,-1)"));
main.append(addSubHeader("log(x)"));
main.append(addParagraph("This returns the natural log of x."));
main.append(addSubHeader("exp(x)"));
main.append(addParagraph("This returns the exponential of x, that is e raised to the power x. For example, exp(1) would return the value of e (approximately 2.71828)."));

addSidebar("linux");