use warnings; use strict; use WWW::Mechanize; use JSON; use Data::Dumper; ##################################################################################### ##################################################################################### ## ## Query ACS 5yr Data Tables ## ## This module will interact with the Beta Census API to put together a user- ## requested table for a user-requested geography. This is a limited program ## which was developed primarily as a learning exercise. Comments throughout ## provide instructive examples on how to use the API (at least in perl). ## ## Developer: Ali Mushtaq ## ##################################################################################### ##################################################################################### ############################################## # User-provided query details ############################################## # Request your own Census Key from "www.census.gov/developers" my $censusAPIkey = ""; # Point to the API -- here, we point to 2017 ACS 5yr my $apibaseURL = "http://api.census.gov/data/2017/acs/acs5"; # Query input, provide desired table and geography # Examples below show how geography requests can be formatted my $table = "B01001"; # Sex by Age my $geo = "state:*"; # All states #my $geo = "state:1,2,4,5"; # Specific States #my $geo = "county:*&in=state:41"; # All counties in State 41 ############################################## # Find information needed to perform http GET # requests. # # The API splits the tables up by column. To # construct a complete table, all columns must # be requested in the http GET statement. Also, # in order to determine all the individual json # files to request, will need to reference the # "variables.json" file in this API to find # all columns. # # This is a tedious structure of the API. # An application can limit the requested # columns (for example, exclude variances # unless requested), but the possible # columns still has to be mined from the # variables.json file. A practical application # would probably hold all data from # variables.json internally. The file is large, # and almost one-third of the file is redundant. # And documentation is limited. # # When the variables.json file is decoded, it is # put into %json. The structure of this particular # json file makes %json a hash of hashes of # hashes. The first level is fairly meaningless: # there's one entry: "variables". The next level # down contain the table names. The third and # last level contains the table and column # names ("concept" and "label", respectively). # The table name is repeated for each column, # making variables.json very redundant. # # To look at a specific column name for a given # table (B01001_001E) the correct dereference is: # # $json{"variables"}{"B01001_001E"}{"label"} # ^ ^ ^ ^ # Useless -' | | | # | | | # Table name -----------' | | # | | # Column identifier ---------' | # | # "label"=column or "concept"=table -' # # Below, for simplicity, the hashes are dereferenced # one level at a time: # # %jsontbl = %{$json{"variables"}} # %jsonvar = %{$jsontbl{"B01001_001E"}} # $jsonvar{"label"} = Column Name # ############################################## print "\nFinding column names and labels for $table ... \n"; # Initialize my $mech = new WWW::Mechanize(autocheck => 1); # Request and read the variables.json file my $apiURL = "$apibaseURL/variables.json"; $mech->get($apiURL); my $json = decode_json($mech->text); # Extract labels and names for $table my $tblname = " "; my %columns = (); my %jsontbl = %{${$json}{"variables"}}; foreach my $tbl (keys %jsontbl){ if($tbl =~ /${table}_/){ my %jsonvar = %{$jsontbl{$tbl}}; $columns{$tbl} = $jsonvar{"label"}; if($tblname eq " "){$tblname=$jsonvar{"concept"};} } } print "Reading data for $tblname ... \n"; ############################################## # Construct the API http GET request, read the # json file for the request and put the data # in an array @data. Each request is limited # to 50 variables. So we need to split and # recombine the request. The array merging # below is a bit clumsy as a result of having # to lengthen the second dimension of the # array as defined by the API. # # When the resulting json file is decoded, it is # put into @data. The structure of this particular # json file makes @data an array of arrays. The # first dimension is the table columns and the # second is the geography. The indices refer to # the order of the request for both dimensions. # This has been kept in %columns for the first # dimension. Geography decoding requires linking # to a geography data dictionary. # # To look at a specific data value, the correct # dereference is: # # $data[$column][$row] # ^ ^ # | '--Varnames are row 0, data # | starts at row 1 and the # | number of rows is determined # | by the requested geography. # | # '--The last column(s) are the geography # variables. The other columns are # the individual table column requests # in order. # ############################################## my @tbllist = sort keys %columns; my $tblcnt = 0; my @data = (); my $numreq = 0; $apiURL = "${apibaseURL}?get="; for(my $i=0; $i<=$#tbllist; $i++){ $apiURL = $apiURL . $tbllist[$i]; $tblcnt++; if($tblcnt==50 or $i==$#tbllist){ $tblcnt=0; $apiURL=$apiURL . "&for=${geo}&key=${censusAPIkey}"; $mech->get($apiURL); $numreq++; $json = decode_json($mech->text); if(!(exists($data[0][0]))){@data = @{$json};} else{ my @moredata = @{$json}; my $lastdata = 0; while($data[0][$lastdata] =~ /${table}_/){$lastdata++;} for(my $j=0; $j<=$#{$data[0]}; $j++){ for(my $k=$lastdata; $k<=$lastdata+$#{$moredata[0]}; $k++){ $data[$j][$k]=$moredata[$j][$k-$lastdata]; } } } $apiURL = "${apibaseURL}?get="; } else{$apiURL = $apiURL . ",";} } print "Data retrieved in $numreq http GET requests \n"; ############################################## # Print the data in .csv format, mainly to # demonstrate how to read the @data array. ############################################## open(OUT, ">$table.csv"); print "Writing to $table.csv \n"; # Data Headers my $j = 0; my $i = 0; while(exists($data[$j][$i])){ while(exists($data[$j][$i])){ if($data[$j][$i] =~ /${table}_/){ print OUT $columns{$data[$j][$i]}, ","; } else{ print OUT $data[$j][$i], ","; } $i++; } print OUT "\n"; $j++; $i = 0; } close(OUT);