#!/usr/bin/ruby
# $Id: parse-oldcurlists.rb 7391 2020-12-25 19:17:37Z flaterco $
#
# The undocumented MDAPI (1.0) geogroup children query (getting stations)
# does not work for currents.  In lieu of that, parse geogroups out of the
# old style station list web pages and populate a table with them.
#
# Precedents:
#   harmbase2-20161231/nos/preparse_current_predictions_pages.rb
#   harmbase2-20161231/importNOSSC.pgcc
#   harmbase2-20161231/importNOSSC.pgcc hdr2_fixups

# Input: Stations?g=*
# 3 levels of headings
#   1. <h1><small>
#   2. <h5>
#   3. <h5>&nbsp;&nbsp;&nbsp;&nbsp;
# 2 levels of stations
#   1. N/A
#   2. <td>&nbsp;&nbsp;&nbsp;&nbsp;<a
#   3. <td>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<a
# "Weak and Variable" stations with no predictions lack the link to
# <a href='Predictions?id=*

require "set"
require "pg"

# Fixup table for level-2 (<h5>, no leading &nbsp;) headings, keyed by the
# heading text exactly as it appears on the page.  The value determines what
# is stored for the stations listed under that heading:
#   String           - the cleaned-up region name.
#   [String, String] - the heading munges in a subregion; the array holds the
#                      region name followed by the subregion name.
#   nil              - the region is nulled out (stored as NULL).
# A heading that is missing from this table gets the region "FIXME", and a
# suggested entry for it is printed so that it can be pasted in here.
hdr2_fixups = {
  "NANTUCKET SOUND ENTRANCE" => "Nantucket Sound entrance",
  "NANTUCKET SOUND" => "Nantucket Sound",
  "VINEYARD SOUND" => "Vineyard Sound",
  "NARRAGANSETT BAY" => "Narragansett Bay",
  "EAST RIVER" => "East River",
  "HARLEM RIVER" => "Harlem River",
  "KILL VAN KULL" => "Kill Van Kull",
  "CHESAPEAKE BAY" => "Chesapeake Bay",
  "POTOMAC RIVER" => "Potomac River",
  "PATUXENT RIVER" => "Patuxent River",
  "LITTLE CHOPTANK RIVER" => "Little Choptank River",
  "CHOPTANK RIVER" => "Choptank River",
  "EASTERN BAY" => "Eastern Bay",
  "CHESTER RIVER" => "Chester River",
  # NOTE(review): the river's name is usually spelled "Sassafras"; confirm
  # whether the stored value should be corrected (the key must keep matching
  # the page).
  "SASSAFRASS RIVER" => "Sassafrass River",
  "ELK RIVER" => "Elk River",
  "PAMLICO SOUND" => "Pamlico Sound",
  "CAPE FEAR RIVER" => "Cape Fear River",
  "WINYAH BAY" => "Winyah Bay",
  "CHARLESTON HARBOR" => "Charleston Harbor",
  "STONO RIVER" => "Stono River",
  "PORT ROYAL SOUND" => "Port Royal Sound",
  "CALIBOGUE SOUND" => "Calibogue Sound",
  "SAVANNAH RIVER" => "Savannah River",
  "WASSAW SOUND" => "Wassaw Sound",
  "OSSABAW SOUND" => "Ossabaw Sound",
  "ST. CATHERINES SOUND" => "St. Catherines Sound",
  "SAPELO SOUND" => "Sapelo Sound",
  "DOBOY SOUND" => "Doboy Sound",
  "ALTAMAHA SOUND" => "Altamaha Sound",
  "ST. SIMONS SOUND" => "St. Simons Sound",
  "ST. ANDREWS SOUND" => "St. Andrews Sound",
  "ST. JOHNS RIVER" => "St. Johns River",
  "SARASOTA BAY" => "Sarasota Bay",
  "TAMPA BAY" => "Tampa Bay",
  "APALACHEE BAY" => "Apalachee Bay",
  "GALVESTON BAY" => "Galveston Bay",
  "SAN DIEGO BAY" => "San Diego Bay",
  "YAQUINA BAY" => "Yaquina Bay",
  "TILLAMOOK BAY" => "Tillamook Bay",
  "GRAYS HARBOR" => "Grays Harbor",
  # The key is misspelled ("STRAIGHT"); the value carries the corrected
  # spelling.
  "STRAIGHT OF JUAN DE FUCA" => "Strait of Juan de Fuca",
  "ADMIRALTY INLET" => "Admiralty Inlet",
  "HOOD CANAL" => "Hood Canal",
  "PUGET SOUND" => "Puget Sound",
  "ROSARIO STRAIT" => "Rosario Strait",
  "SAN JUAN CHANNEL" => "San Juan Channel",
  "GEORGIA STRAIT" => "Georgia Strait",
  "DIXON ENTRANCE" => "Dixon Entrance",
  "PEARSE CANAL" => "Pearse Canal",
  "PORTLAND CANAL" => "Portland Canal",
  "REVILLAGIGEDO CHANNEL" => "Revillagigedo Channel",
  "TONGASS NARROWS" => "Tongass Narrows",
  "FELICE STRAIT" => "Felice Strait",
  "NICHOLS PASSAGE" => "Nichols Passage",
  "BEHM CANAL" => "Behm Canal",
  "CLARENCE STRAIT" => "Clarence Strait",
  "ERNEST SOUND" => "Ernest Sound",
  "ZIMOVIA STRAIT" => "Zimovia Strait",
  "CORDOVA BAY" => "Cordova Bay",
  "TLEVAK STRAIT" => "Tlevak Strait",
  "MEARES PASSAGE" => "Meares Passage",
  "ULLOA CHANNEL" => "Ulloa Channel",
  "EL CAPITAN PASSAGE" => "El Capitan Passage",
  "SUMNER STRAIT" => "Sumner Strait",
  "KEKU STRAIT" => "Keku Strait",
  "WRANGELL NARROWS" => "Wrangell Narrows",
  "FREDERICK SOUND" => "Frederick Sound",
  "STEPHENS PASSAGE" => "Stephens Passage",
  "CHATHAM STRAIT" => "Chatham Strait",
  "SITKA SOUND" => "Sitka Sound",
  "KRESTOF SOUND" => "Krestof Sound",
  "NEVA STRAIT" => "Neva Strait",
  "SALISBURY SOUND" => "Salisbury Sound",
  "PERIL STRAIT" => "Peril Strait",
  "ICY BAY" => "Icy Bay",
  "CONTROLLER BAY" => "Controller Bay",
  "SHUMAGIN ISLANDS" => "Shumagin Islands",
  "UNIMAK ISLAND" => "Unimak Island",
  "ISANOTSKI STRAIT" => "Isanotski Strait",
  "ALEUTIAN ISLANDS" => "Aleutian Islands",
  "PORT MOLLER" => "Port Moller",
  "KVICHAK BAY" => "Kvichak Bay",
  "KUSKOKWIM BAY" => "Kuskokwim Bay",
  "KENNEBEC RIVER" => "Kennebec River",
  "CASCO BAY" => "Casco Bay",
  "PORTSMOUTH HARBOR" => "Portsmouth Harbor",
  "CAPE COD BAY" => "Cape Cod Bay",
  "CAPE COD CANAL" => "Cape Cod Canal",
  "BUZZARDS BAY" => "Buzzards Bay",
  "BLOCK ISLAND SOUND" => "Block Island Sound",
  "FISHERS ISLAND SOUND" => "Fishers Island Sound",
  "LONG ISLAND SOUND" => "Long Island Sound",
  "LONG ISLAND, south coast" => "Long Island",
  "JAMAICA BAY" => "Jamaica Bay",
  "ARTHUR KILL" => "Arthur Kill",
  "CARQUINEZ STRAIT" => "Carquinez Strait",
  "CARROLL INLET" => "Carroll Inlet",
  "CHESAPEAKE and DELAWARE CANAL" => "Chesapeake and Delaware Canal",
  "COOK INLET" => "Cook Inlet",
  "CROSS SOUND" => "Cross Sound",
  "CUMBERLAND SOUND" => "Cumberland Sound",
  "ELIZABETH RIVER" => "Elizabeth River",
  "FORT PIERCE INLET" => "Fort Pierce Inlet",
  # Mixed-case spelling; a separate key from "GALVESTON BAY" because the
  # lookup is case-sensitive.
  "Galveston Bay" => "Galveston Bay",
  "GREAT WICOMICO RIVER" => "Great Wicomico River",
  "HAMPTON ROADS" => "Hampton Roads",
  "HUDSON RIVER, Midchannel" => "Hudson River (midchannel)",
  "HUMBOLDT BAY" => "Humboldt Bay",
  "JAMES RIVER" => "James River",
  "Laguna Madre" => "Laguna Madre",
  "LAKE WORTH INLET" => "Lake Worth Inlet",
  "LITUYA BAY" => "Lituya Bay",
  "LYNN CANAL" => "Lynn Canal",
  "Matagorda Bay" => "Matagorda Bay",
  "MIAMI HARBOR" => "Miami Harbor",
  "MISSISSIPPI SOUND" => "Mississippi Sound",
  "MOBILE BAY" => "Mobile Bay",
  # Both the misspelled and the correct heading map to the same region name.
  "MONTERY BAY" => "Monterey Bay",
  "MONTEREY BAY" => "Monterey Bay",
  "NANSEMOND RIVER" => "Nansemond River",
  "NASSAU SOUND" => "Nassau Sound",
  "NEWARK BAY" => "Newark Bay",
  "NORTH EDISTO RIVER" => "North Edisto River",
  "OLGA STRAIT" => "Olga Strait",
  "ORCA BAY" => "Orca Bay",
  "PATAPSCO RIVER" => "Patapsco River",
  "PENSACOLA BAY" => "Pensacola Bay",
  "POCOMOKE SOUND" => "Pocomoke Sound",
  "PORT EVERGLADES" => "Port Everglades",
  "PRINCE WILLIAM SOUND" => "Prince William Sound",
  "RAPPAHANNOCK RIVER" => "Rappahannock River",
  "RARITAN BAY" => "Raritan Bay",
  "RARITAN RIVER" => "Raritan River",
  "Sabine Pass" => "Sabine Pass",
  "SACRAMENTO RIVER" => "Sacramento River",
  # The North/South qualifier is dropped; both map to the same region name.
  "SAN FRANCISCO BAY, North" => "San Francisco Bay",
  "SAN FRANCISCO BAY, South" => "San Francisco Bay",
  "SAN JOAQUIN RIVER" => "San Joaquin River",
  "SAN PABLO BAY" => "San Pablo Bay",
  "SANTEE RIVERS" => "Santee Rivers",    # Semi-ambiguous.
  "SOUTH EDISTO RIVER" => "South Edisto River",
  "ST HELENA SOUND" => "St. Helena Sound",
  "ST. ANDREW BAY" => "St. Andrew Bay",
  "STIKINE STRAIT" => "Stikine Strait",
  "SUISUN BAY" => "Suisun Bay",
  "TANGIER SOUND" => "Tangier Sound",
  "YORK RIVER" => "York River",
  "NEW YORK HARBOR ENTRANCE, Ambrose Channel" => "New York Harbor entrance",
  # Headings that munge in a subregion: [region, subregion].
  "NEW YORK HARBOR, Lower Bay" => ["New York Harbor", "Lower Bay"],
  "NEW YORK HARBOR, Upper Bay" => ["New York Harbor", "Upper Bay"],

  # Redundant ones: the region is nulled out.  (Presumably these add nothing
  # beyond the state-level heading -- confirm.)
  "PUERTO RICO" => nil,
  "BAY OF FUNDY" => nil,
  "MAINE COAST" => nil,
  "MASSACHUSETTS COAST" => nil,
  "NEW JERSEY COAST" => nil,
  "NORTH CAROLINA COAST" => nil,
  "SOUTH CAROLINA COAST" => nil,
  "TEXAS COAST" => nil,
  "LOUISIANA COAST" => nil,
  "LOWER CALIFORNIA" => nil,
  "CALIFORNIA COAST" => nil,
  "OREGON COAST" => nil,
  "WASHINGTON COAST" => nil,
  "WASHINGTON-BRITISH COLUMBIA COAST" => nil,
  "VIRGINIA, outer coast" => nil,
  "ALASKA PENINSULA" => nil,
  "BERING SEA" => nil,

  # Ambiguous ones (mostly compound "X and Y" / "X to Y" style headings):
  # the region is nulled out rather than picking one name.
  "SEVERN and MAGOTHY RIVERS" => nil,
  "BACK, GUNPOWDER and BUSH RIVERS" => nil,
  "NEW and WRIGHT RIVERS" => nil,
  "FLORIDA REEFS to MIDNIGHT PASS" => nil,
  "BOCA CIEGA BAY and ST. JOSEPH SOUND" => nil,
  "POSSESSION SOUND-SKAGIT BAY" => nil,
  "HARO STRAIT AND BOUNDARY PASS" => nil,
  "HECATE STRAIT AND CHATHAM SOUND" => nil,
  "BLAKE CHANNEL AND EASTERN PASSAGE" => nil,
  "BUCARELLI BAY TO DAVIDSON INLET" => nil,
  "NAKWASINA SOUND AND PASSAGE" => nil,
  "HERENDEEN BAY - PORT HEIDEN" => nil,
  "NUSHAGAK BAY AND APPROACHES" => nil,
  "PISCATAQUA RIVER and tributaries" => nil,
  "BOSTON HARBOR APPROACHES" => nil,
  "BOSTON HARBOR-PRESIDENT ROADS" => nil,
  "BOSTON HARBOR-NANTASKET ROADS" => nil,
  "BOSTON HARBOR-HINGHAM BAY" => nil,
  "VINEYARD SOUND-BUZZARDS BAY" => nil,
  "GARDINERS BAY, etc." => nil,
  "Aransas Bay and Corpus Christi Bay" => nil,
  "Brazos Santiago Pass and Port Isabel" => nil,
  "DEL., MD. And VA. COAST" => nil,
  "DELAWARE BAY and RIVER" => nil,
  "GLACIER BAY AND ICY STRAIT" => nil,
  "GOLDEN GATE AND APPROACHES" => nil,
  "KHAZ BAY TO CAPE EDWARD" => nil,
  "WEST and SOUTH RIVERS" => nil,
  "MOBJACK BAY and PIANKATANK RIVER" => nil,
  "SHELIKOF STRAIT AND KODIAK ISLANDS" => nil,
  "HAWAIIAN ISLANDS" => nil,
  # Ugh...
  "COLUMBIA RIVER AND APPROACHES" => nil
}

# Connect to the harmbase2 database and try to create the destination table.
# If the table is already there, say so and carry on with the existing one.
db = PG::Connection.open(dbname: "harmbase2")

create_geogroups_table = "
    create table currents_geogroups (
      sid    text primary key,  -- NOT sidplus, NO bin numbers
      state  text not null,     -- level 5 in MDAPI tides
      region text,              -- level 6, sometimes nulled out
      subregion text            -- level 7, often redundant with station name
    )
  "

begin
  db.exec(create_geogroups_table)
rescue PG::DuplicateTable
  puts "Table currents_geogroups already exists; continuing..."
end

# Looks up a raw level-2 heading (the <h5> text without leading &nbsp;) in
# the fixup table.
#
# raw_name - heading text exactly as captured from the page.
# fixups   - Hash mapping raw heading text to a String region name, a
#            two-element [region, subregion] Array, or nil.
#
# Returns the mapped value when the key is present (a nil value is returned
# as nil).  For an unknown heading, prints a ready-to-paste table entry to
# stdout and returns the placeholder "FIXME".
def fixed_up_region(raw_name, fixups)
  return fixups[raw_name] if fixups.key?(raw_name)

  # raise "Have no fixup for geogroup name #{raw_name}"
  suggestion = raw_name.split.map(&:capitalize).join(' ')
  print "  \"#{raw_name}\" => \"#{suggestion}\",\n"
  "FIXME"
end

# Walks the slurped station-list page and yields one
# [sid, state, region, subregion] Array per station, in page order.  A station
# ID that has already been yielded is skipped.
#
# lines         - Array of input lines (the whole page, or pages).
# region_lookup - callable taking the raw level-2 heading text and returning
#                 a String, a [region, subregion] Array, or nil.
#
# Heading levels that have not been seen under the current parent are
# reported as the placeholder "ERROR".  A level-3 heading containing " and "
# is treated as ambiguous and yields a nil subregion.
#
# Raises RuntimeError, naming the 1-based input line, when a heading or
# station entry cannot be parsed, and when a station is nested under a
# level-3 heading while the level-2 fixup already supplies a subregion.
def each_station_geogroup(lines, region_lookup)
  state = region = subregion = "ERROR"
  seen_sids = Set.new

  # A manual index is needed because a station entry spans a fixed number of
  # lines that must be skipped as a unit.
  i = 0
  while i < lines.size
    line = lines[i]

    if line.include?("<h1>")
      state = line[/<h1><small>([^<]+)/, 1]
      raise "Unparseable <h1> heading at input line #{i + 1}" unless state
      # A new top-level heading invalidates both lower levels.
      region = subregion = "ERROR"
      i += 1

    elsif line.include?("<h5>")
      if line.include?("&nbsp;")
        # Indented <h5>: level-3 heading.
        subregion = line[/<h5>(&nbsp;)+([^<]+)/, 2]
        raise "Unparseable level-3 <h5> heading at input line #{i + 1}" unless subregion
        # Ambiguous ones have appeared:
        #   Montague Strait and Knight Island Passage
        #   Orca Bay and Orca Inlet
        subregion = nil if subregion.include?(" and ")
      else
        # Unindented <h5>: level-2 heading, which goes through the fixups.
        raw_name = line[/<h5>([^<]+)/, 1]
        raise "Unparseable level-2 <h5> heading at input line #{i + 1}" unless raw_name
        region = region_lookup.call(raw_name)
        subregion = "ERROR"
      end
      i += 1

    # preparse_current_predictions_pages.rb skipped the "Weak and variable"
    # ones, but we need them now.  We don't need the sidplus from the
    # predictions link, coordinates, or other misc.
    elsif line.include?("onmouseover")
      # Four &nbsp per indent step: 1 = under a level-2 heading,
      # 2 = under a level-3 heading.
      depth = line.scan(/&nbsp/).size / 4

      # The station ID is the first text node two lines further down.
      sid_line = lines[i + 2]
      raise "Station entry at input line #{i + 1} is truncated" if sid_line.nil?
      sid = sid_line[/>([^<]+)/, 1]
      raise "No station ID found at input line #{i + 3}" unless sid

      # Skip the remainder of the entry (2 lines to the ID, 9 more after it).
      i += 11
      next if seen_sids.include?(sid)

      if region.kind_of?(Array)
        # The level-2 heading already carries a subregion, so there is no
        # room for a level-3 heading as well.
        raise "Subregion conflict with munged region" unless depth < 2
        row = [sid, state, region[0], region[1]]
      else
        row = [sid, state, region, (depth < 2 ? nil : subregion)]
      end

      yield row
      seen_sids.add(sid)

    else
      i += 1
    end
  end
end

lines = ARGF.readlines # slurp

# The lookup is passed as a lambda so that the parser does not depend on the
# fixup table directly.  exec_params is the bind-parameter form of exec;
# passing parameters to plain exec is deprecated in current versions of pg.
each_station_geogroup(lines, ->(raw_name) { fixed_up_region(raw_name, hdr2_fixups) }) do |row|
  db.exec_params("insert into currents_geogroups values ($1, $2, $3, $4)", row)
end
