%%% =========================================================================
%%% FLORID Example: INFORMATION INTEGRATION:
%%%
%%% GLOBAL STATISTICS: file:/home/dbis/.public_html/.mir/wol/index.html
%%%
%%% Author: Wolfgang May
%%%
%%% mondial-gs.flp: read global statistics source
%%% =========================================================================
%
?- sys.theOMAccess.debugOn.     % trace Web access
?- sys.theOM.eqTraceOn.         % trace derived equalities
?- sys.prn.style@("bound").     % set output mode
%
/* Define several entry points for the data source: */
%
gs[home -> "http://home.worldonline.nl/~quark/" ;
   mirror -> "file:/home/dbis/.public_html/.mir/wol/" ].
%
% The local mirror is the source that is actually used below (via gs.src).
gs[src->gs.mirror].
/* Several auxiliary formatting methods are defined, using \Florid's perl
   interface. Mainly, regular expressions are used. Each method is a string
   of Perl statements that reads $in[0] and leaves its result in $out[0]:
     country_name: strips a leading "the"/"The", trailing whitespace and a
                   trailing " I"; turns "X, South"/"X, North" into
                   "South X"/"North X"; cuts everything after a comma;
                   expands "St." to "Saint" and " & " to " and "; removes
                   apostrophes.
     capital:      removes apostrophes and cuts the string at the first
                   "(", ";", "," or "/".
     province:     removes apostrophes and "?", cuts at "(" or "[", replaces
                   "-" by blank and " & " by " and ", and maps accented
                   characters to plain ones.
     city_name:    as above for cities; additionally corrects a fixed list
                   of misspelled city names. */
%
format[country_name->"$out[0]=$in[0];
          $out[0]=~s|\Athe \s*||g;
          $out[0]=~s|\AThe \s*||g;
          $out[0]=~s/\s*\Z//g;
          $out[0]=~s/([^,]*), South/South $1/g;
          $out[0]=~s/([^,]*), North/North $1/g;
          $out[0]=~s/,.*//g;
          $out[0]=~s/St\./Saint/g;
          $out[0]=~s/ & / and /g;
          $out[0]=~s/'//g;
          $out[0]=~s/ I\Z//g;";
       capital->"$out[0]=$in[0];
          $out[0]=~s/'//g;
          $out[0]=~s/([^(]*)\((.*)/$1/g;
          $out[0]=~s/([^;]*);(.*)/$1/g;
          $out[0]=~s/([^,]*),(.*)/$1/g;
          $out[0]=~s|([^/]*)/(.*)|$1|g;
          $out[0]=~s/\s*\Z//g";
       province->"$out[0]=$in[0];
          $out[0]=~s/'//g;
          $out[0]=~s/\?//g;
          $out[0]=~s/([^(]*)\((.*)/$1/g;
          $out[0]=~s/([^[]*)\[(.*)/$1/g;
          $out[0]=~s/-/ /g;
          $out[0]=~s/ & / and /g;
          $out[0]=~s/s[^a-z ]ka/ska /g;
          $out[0]=~s/z[^a-z ]ka/zka /g;
          $out[0]=~tr/ÄáâãäåÉéèíïìÍÖóôöÜúüçñ/AaaaaaEeeiiiIOoooUuucn/;
          $out[0]=~s/\s*\Z//g";
       city_name->"$out[0]=$in[0];
          $out[0]=~s/'//g;
          $out[0]=~s/`//g;
          $out[0]=~s/ CDP//g;
          $out[0]=~s/Adazapari/Adapazari/g;
          $out[0]=~s/Karkuk/Kirkuk/g;
          $out[0]=~s/Baranquilla/Barranquilla/g;
          $out[0]=~s/Ferrant/Ferrand/g;
          $out[0]=~s/Santafé de Bogotá/Bogota/g;
          $out[0]=~s/Metropolitan Lima/Lima/g;
          $out[0]=~s/Recklingshausen/Recklinghausen/g;
          $out[0]=~s/Petropavlovsk.\n.Kamchatsky/Petropavlovsk Kamchatsky/g;
          $out[0]=~s/Petropavlovsk...Kamchatsky/Petropavlovsk Kamchatsky/g;
          $out[0]=~s/([^,]*),(.*)/$1/g;
          $out[0]=~s|([^/]*)/(.*)|$1|g;
          $out[0]=~s/([^(]*)\((.*)\)/$1/g;
          $out[0]=~s/St.([A-Z])/St. $1/g;
          $out[0]=~s/-/ /g;
          $out[0]=~tr/ÄáâãäåÉéèíïìÍÖóôöÜúüçñ/AaaaaaEeeiiiIOoooUuucn/;
          $out[0]=~s/\s*\Z//g"].
%
/* Numbers are normalized: all whitespace is removed, everything from the
   first "." on is cut off, parenthesized parts are dropped, and commas
   between digits are deleted. A string that contains no digit at all
   yields 0. */
%
replace[kommas->"$out[0]=$in[0];
          $out[0]=~s|\s*||g;
          $out[0]=~s|\..*||g;
          $out[0]=~s/\(.*\)//g;
          $out[0]=~s/([0-9]),([0-9])/$1$2/g;
          if (!($out[0] =~/[0-9]/)) {$out[0] = 0};"].
%
% Turns an index-page url ("...ix...") into the corresponding data-page url.
replace[ixtodata->"$out[0]=$in[0];
          $out[0]=~s/ix/data/g"].
%
% Replaces every blank by "-"; used below to build object ids from names.
% NOTE(review): the second substitution requires a literal blank, and none is
% left after the first one -- confirm whether the order is intended.
replace[spaces->"$out[0]=$in[0];
          $out[0]=~s/ /-/g;
          $out[0]=~s/\s*[0-9]* / /g"].
%
/* GS contains an error, confusing Congo (Zaire, formerly Belgian Congo) and
   Congo (Democratic Republic, formerly Congo-Brazzaville, belonging to french
   central africa). The main cities and provinces of Zaire are attributed to
   the Congo. */
%
% Both "Congo, Dem.Rep." and plain "Congo" are rewritten to "Zaire".
repl[congo_zaire->"$out[0]=$in[0];
          $out[0]=~s/Congo, Dem.Rep./Zaire/g;
          $out[0]=~s/Congo/Zaire/g"].
%
/* the US state abbreviations (postal codes) are mapped to the state names.*/
%
us_postal_code("AL","Alabama").         us_postal_code("LA","Louisiana").      us_postal_code("OH","Ohio").
us_postal_code("AK","Alaska").          us_postal_code("ME","Maine").          us_postal_code("OK","Oklahoma").
us_postal_code("AZ","Arizona").         us_postal_code("MD","Maryland").       us_postal_code("OR","Oregon").
us_postal_code("AR","Arkansas").        us_postal_code("MA","Massachusetts").  us_postal_code("PA","Pennsylvania").
us_postal_code("CA","California").      us_postal_code("MI","Michigan").       us_postal_code("RI","Rhode Island").
us_postal_code("CO","Colorado").        us_postal_code("MN","Minnesota").      us_postal_code("SC","South Carolina").
us_postal_code("CT","Connecticut").     us_postal_code("MS","Mississippi").    us_postal_code("SD","South Dakota").
us_postal_code("DE","Delaware").        us_postal_code("MO","Missouri").       us_postal_code("TN","Tennessee").
us_postal_code("DC","Distr. Columbia"). us_postal_code("MT","Montana").        us_postal_code("TX","Texas").
% Remaining entries of the postal-code table. West Virginia (WV) was missing
% from the original list (49 states + DC); it is added here so that US cities
% tagged ", WV" can be linked to their state.
us_postal_code("FL","Florida").         us_postal_code("NE","Nebraska").       us_postal_code("UT","Utah").
us_postal_code("GA","Georgia").         us_postal_code("NV","Nevada").         us_postal_code("VT","Vermont").
us_postal_code("HI","Hawaii").          us_postal_code("NH","New Hampshire").  us_postal_code("VA","Virginia").
us_postal_code("ID","Idaho").           us_postal_code("NJ","New Jersey").     us_postal_code("WA","Washington").
us_postal_code("IL","Illinois").        us_postal_code("NM","New Mexico").     us_postal_code("WI","Wisconsin").
us_postal_code("IN","Indiana").         us_postal_code("NY","New York").       us_postal_code("WY","Wyoming").
us_postal_code("IA","Iowa").            us_postal_code("NC","North Carolina"). us_postal_code("WV","West Virginia").
us_postal_code("KS","Kansas").          us_postal_code("ND","North Dakota").
us_postal_code("KY","Kentucky").
%
/* \paragraph{Continents.}
   The continents are associated manually with the respective pages.*/
%
europe:continent[file@(gs)->"europe/euix.htm"; name->"Europe"].
asia:continent[file@(gs)->"asia/asix.htm"; name->"Asia"].
america:continent[file@(gs)->"america/amix.htm"; name->"America"].
australia:continent[file@(gs)->"oceania/ocix.htm"; name->"Australia/Oceania"].
africa:continent[file@(gs)->"africa/afix.htm"; name->"Africa"].
%
% The url of a continent page is the source prefix gs.src followed by the
% relative file name.
C[url@(gs)->U] :- C:continent[file@(gs)->A], strcat(gs.src,A,U).
%
%europe:rel_continent.
%
/* \flrule{rel\_continent} can be initialized to restrict the evaluation to a
   single continent. Otherwise, all continents are made relevant.*/
%
exists_rel_continent :- C:rel_continent.
?- sys.strat.doIt.
C: rel_continent :- C:continent, not exists_rel_continent.
?- sys.strat.doIt.
%
/* For all relevant continents, the respective WWW pages are accessed by the
   built-in method \flrule{url[get\Fd webdoc]}. Additionally, the continent
   data pages (urls obtained by replacing ``ix'' in the continent's url by
   ``data'') are parsed using the built-in HTML parser. */
%
?- sys.echo@("*** ACCESSING GS CONTINENT PAGES ***").
%
% The data-page url of a relevant continent is derived from its index-page url.
C:rel_continent[data_url@(gs)->U2] :-
   C:rel_continent[url@(gs)->U],
   perl(replace.ixtodata,U,U2).
%
% Index pages are fetched (get), data pages are parsed (parse).
U:url.get :- C:rel_continent[url@(gs)->U].
U:url.parse :- C:rel_continent[data_url@(gs)->U].
%
?- sys.strat.doIt.
%
/* \paragraph{General Country Data.}
   The continent data pages contain tables with information about the
   countries. After parsing, this information is contained in the parse-tree:
   The class \flrule{U.parse.table} contains all tables of \flrule{U.parse}
   (here, this is exactly one table per continent page). For a table
   \flrule{T}, \flrule{T.table@(0)} contains the table body \flrule{TB},
   \flrule{TB.tbody@($n$)} is the $n$-th table row, from which
   \flrule{tr@($k$)} contains the $k$-th column which is divided into several
   parts by the innermost tags. Depending on how many parts there are, and
   which of them are strings, the contents of the table entry can be derived. */
%
?- sys.echo@("*** analyzing GS continent pages for country Data ***").
%
% Case 1: the cell has exactly one part; it is taken as the contents.
element(Table,Row,Column)[contents->Contents;type->Type] :-
   CT:continent[data_url@(gs)->U],
   Table:(U.parse.table),
   Table.table@(0)[tbody@(Row)->_[tr@(Column)->X [Type@(0)->Contents]]],
   not X.Type@(1)[].
%
% Case 2: two parts, the first is a string and the second is not.
element(Table,Row,Column)[contents->Contents;type->Type] :-
   CT:continent[data_url@(gs)->U],
   Table:(U.parse.table),
   Table.table@(0)[tbody@(Row)->_[tr@(Column)->X
       [Type@(0)->Contents;Type@(1)->I0]]],
   not string(I0), string(Contents).
%
% Case 3: two parts, the second is a string and the first is not.
element(Table,Row,Column)[contents->Contents;type->Type] :-
   CT:continent[data_url@(gs)->U],
   Table:(U.parse.table),
   Table.table@(0)[tbody@(Row)->_[tr@(Column)->X
       [Type@(0)->I0;Type@(1)->Contents]]],
   not string(I0), string(Contents).
%
?- sys.strat.doIt.
%
/* The individual table entries give the name of the country (0th column),
   its capital name (1st column), its area (2nd column), and its
   population (3rd column).
*/
%
% One country object per table row whose name cell is a string other than
% "Total"; the object id is built from the continent and the row number.
cid(gs,CT,DZ):country[name_str->N2; continent@(gs)->>CT; population@(gs)->P2] :-
   CT:continent[data_url@(gs)->U; name->CN],
   T:(U.parse.table),
   element(T,DZ,0)[contents->N;type->td],
   string(N), not N="Total",
   element(T,DZ,3)[contents->P;type->td],
   perl(replace.kommas, P, P2),
   perl(format.country_name, N, N2).
% Capital names are attached to the country with the same formatted name;
% raw entries of at most two characters are ignored.
C[capital_name->>Cap2] :-
   C:country[name_str->N2],
   CT:continent[data_url@(gs)->U],
   T:(U.parse.table),
   element(T,DZ,0)[contents->N;type->td],
   string(N), not N="Total",
   element(T,DZ,1)[contents->Cap;type->td],
   perl(format.country_name, N, N2),
   perl(format.capital, Cap, Cap2),
   strlen(Cap,L), L > 2.
%
?- sys.strat.doIt.
%
% "USA" is added as an alternative name for "United States".
C[name@(gs)->>"USA"] :- C:country[name@(gs)->>"United States"].
%
/* Country names of the form ``Myanmar (Burma)'' (appearing on the
   Myanmar page) or ``Holy See (Vatican City)'' have to be resolved into
   two alternative names. */
%
C[name@(gs)->>N] :-
   C:country[name_str->N],
   not substr("(",N).
C[name@(gs)->>{N1,N2}] :-
   C:country[name_str->N],
   pmatch(N,"/([^(]*) \((.*)\)/", ["$1","$2"], [N1,N2]).
%
/* The continent pages given by \flrule{continent.url} contain links
   labeled by country names to pages containing administrative divisions and
   main cities for some of the countries. Other links should not be followed: */
%
gs[exclude->>{"home", "general data", "cities", "population", "economy",
              "provinces", "Profiler",
              "Germany I"      % excludes also Germany II!
             }].
%
?- sys.strat.doIt.
%
/* The remaining links on \flrule{continent.url} lead to country pages.
   Note that the slots \flrule{url.get[hrefs@(label)\Fd url]} have been filled
   automatically when calling \flrule{url.get}. */
%
% The link label is passed through repl.congo_zaire before it is used as key.
CT[link@(Label2)->>U] :-
   CT:rel_continent.url@(gs).get[hrefs@(Label) ->> U],
   not substr(gs..exclude, Label),
   perl(repl.congo_zaire,Label,Label2),   %%%<<<<<<<<
   not Label = "".
%
/* \flrule{rel\_country} can be initialized to restrict the evaluation to
   some countries. Otherwise, all countries are made relevant.*/
%
%"Germany":rel_country.
%"Myanmar":rel_country.
%
% If no country was marked relevant by hand, every link label found on a
% continent page becomes a relevant country.
exists_rel_country :- C:rel_country.
?- sys.strat.doIt.
Country:rel_country :-
   CT:continent[link@(Country)->>U],
   not exists_rel_country.
%
%%% =========================================================================
/* \paragraph{Evaluating GlobalStatistics country pages.}
   The GlobalStatistics country pages contain tables with information about
   administrative divisions and main cities. They are parsed by the built-in
   SGML parser, obtaining a parse-tree of the page using a special tagfile
   (specifying which HTML tags are ignored or pruned). */
%
?- sys.echo@("*** ACCESSING GS COUNTRIES ***").
%
?- sys.theOMAccess.setTagFile@("mondial-gs.tags").
%
/* For China and the USA, additional city pages are given.*/
%
asia[link@("China")->>U] :-
   strcat(gs.src,"asia/chinacy.htm",U),
   asia:rel_continent,
   "China":rel_country.
%
america[link@("USA")->>U] :-
   strcat(gs.src,"america/usacity.htm",U),
   america:rel_continent,
   "USA":rel_country.
%
?- sys.strat.doIt.
%
% Every page linked from a continent under a relevant country label is
% classified as country_url and parsed.
U:country_url.parse :-
   C:continent[link@(Country)->>U],
   Country:rel_country.
%
?- sys.strat.doIt.
%
/* Similar to the continent data pages' tables, the table entries on the
   country data pages are translated from the parse tree. */
%
?- sys.echo@("*** Analyzing Tables ***").
%
% One part only: it is the contents.
element(Table,Row,Column)[contents->Contents;type->Type] :-
   U:country_url,
   Table:(U.parse.table),
   Table.table@(0)[tbody@(Row)->_
       [tr@(Column)->X[Type@(0)->Contents]]],
   not X.Type@(1)[].
%
% Exactly two parts, both strings: they are concatenated.
element(Table,Row,Column)[contents->Contents;type->Type] :-
   U:country_url,
   Table:(U.parse.table),
   Table.table@(0)[tbody@(Row)->_[tr@(Column)->X
       [Type@(0)->I0;Type@(1)->I1]]],
   string(I0), string(I1),
   not X.Type@(2)[],
   strcat(I0,I1,Contents).
%
% Exactly two parts, only the second is a string: the second is the contents.
element(Table,Row,Column)[contents->I1;type->Type] :-
   U:country_url,
   Table:(U.parse.table),
   Table.table@(0)[tbody@(Row)->_[tr@(Column)->X
       [Type@(0)->I0;Type@(1)->I1]]],
   not string(I0), string(I1),
   not X.Type@(2)[].
%
% Three parts, all strings: they are concatenated in order.
element(Table,Row,Column)[contents->Contents;type->Type] :-
   U:country_url,
   Table:(U.parse.table),
   Table.table@(0)[tbody@(Row)->_[tr@(Column)->_
       [Type@(0)->I0;Type@(1)->I1;Type@(2)->I2]]],
   string(I0), string(I1), string (I2),
   strcat(I0,I1,I01), strcat(I01,I2,Contents).
%
% Three parts, the first is not a string: the other two are concatenated.
element(Table,Row,Column)[contents->Contents;type->Type] :-
   U:country_url,
   Table:(U.parse.table),
   Table.table@(0)[tbody@(Row)->_[tr@(Column)->_
       [Type@(0)->I0;Type@(1)->I1;Type@(2)->I2]]],
   not string(I0), string(I1), string (I2),
   strcat(I1,I2,Contents).
%
% Column spans other than "1" are recorded as numbers.
element(Table,Row,Column)[colspan->Span2] :-
   U:country_url,
   Table:(U.parse.table),
   Table.table@(0)[tbody@(Row)->_
       [tr@(Column)->X[colspan -> Span]]],
   not Span = "1",
   string2float(Span,Span2).
%
/* The header row(s) are determined by the \texttt{\flq th\frq} tags.
   Additionally, for every column, the header entry (attribute name)
   is extracted.*/
%
T[header_row->>HR] :-
   U:country_url,
   T:(U.parse.table),
   element(T,HR,_)[type->th].
%
T[header@(HR,Col)->>String] :-
   U:country_url,
   T:(U.parse.table)[header_row->>HR],
   element(T,HR,Col)[contents->String;type->th].
%
?- sys.strat.doIt.
%
/* Most country pages contain two tables: one with administrative divisions,
   the other with main cities. Additionally, for each of them, the table
   structure is varying. For the administrative divisions, the
   \flrule{capital} column is not contained in every table; for the main
   city table, the \emph{province} column is not contained in every table.

   In the first step, the tables of administrative divisions and the tables
   of main cities are identified (searching a table entry which contains the
   appropriate header string). Additionally, the countries to which the
   tables refer are identified.*/
%
?- sys.echo@("*** Administrative Divisions: Analyzing Tables ***").
%
% Table types, and the patterns that identify them; the captured group of
% each pattern is the country name.
tabtype(maincitytab).
tabtype(admdivtab).
header(maincitytab,"/municipalities of (.*)/").
header(maincitytab,"/main cities of (.*)/").
header(admdivtab,"/administrative divisions of (.*)/").
%
% A table is classified by the first pattern one of its entries matches. Both
% the link label and the (formatted) name found in the table are recorded as
% internal country names. Entries containing "II" are skipped.
T:TabType[int_country_name->>LN; int_country_name->>N2] :-
   CT:continent[link@(LN)->>U],
   T:(U.parse.table),
   element(T,_,_)[contents->I],
   not substr("II",I),
   header(TabType,Pattern), tabtype(TabType),
   pmatch(I,Pattern,"$1",Name),
   perl(repl.congo_zaire,Name,Name2),   %<<<<<<<<<<<<<<<<
   perl(format.country_name, Name2, N2).
%
?- sys.strat.doIt.
%
/* References of the form ``Myanmar (Burma)'' have to be resolved into two
   alternative names, then the tables can be associated with the country
   objects. */
%
T[country->>N] :-
   tabtype(TabType),
   T:TabType[int_country_name->>N],
   not substr("(",N).
T[country->>{N1,N2}] :-
   T:TabType[int_country_name->>N],
   tabtype(TabType),
   pmatch(N,"/([^(]*) \((.*)\)/", ["$1","$2"], [N1,N2]).
%
?- sys.strat.doIt.
%
% A country refers to a table through a method named after the table type
% (admdivtab or maincitytab).
C[TabType->T] :-
   C:country[name@(gs)->>N],
   T:TabType[country->>N],
   tabtype(TabType).
%
/* alternative country names known from table headers are propagated to
   countries. */
?- sys.strat.doIt.
C[name@(gs)->>N] :-
   C:country[TabType->T[country->>N]],
   tabtype(TabType).
%
?- sys.strat.doIt.
%
/* \paragraph{Administrative Divisions.}
   For the administrative divisions, the area, population and capital (not
   present in all tables) columns are identified. Note that
   \flrule{pop\_col} is multivalued. */
less(0,1).
less(0,2). less(1,2).
less(0,3). less(1,3). less(2,3).
less(0,4). less(1,4). less(2,4). less(3,4).
%
?- sys.strat.doIt.
%
% The name column is always column 0; area columns are found via the header.
T[name_col->0;area_col->>Col] :-
   T:admdivtab[header@(Row,Col)->>Strg],
   substr("area",Strg),
   not substr("total",Strg).    %% Netherlands: total and land area
%
T[pop_col->>Col] :-
   T:admdivtab[header@(Row,Col)->>Strg],
   substr("pop",Strg).
%
T[cap_col->Col] :-
   T:admdivtab[header@(Row,Col)->>Strg],
   substr("cap",Strg).
%
?- sys.strat.doIt.
%
/* Due to multispan's, the maximum value for area and capital column has to
   be taken (cf.\ Australia) */
%
T[area_col->Col] :- Col = max{C [T]; T[area_col->>C]}.
T[cap_col->Col] :- Col = max{C [T]; T[cap_col->>C]}.
%
?- sys.strat.doIt.
%
/* for some of the countries, administrative divisions are organized in a
   two-level hierarchy, e.g., France and Italy. They can be detected by the
   existence of boldfaced table entries (table entries s.t.
   \flrule{entry.b@(0)} is defined).*/
%
T.two_level[] :-
   T:admdivtab[area_col->AS],
   element(T,Z,AS)[contents->I;type->td],
   I[b@(0)->P].
%
?- sys.strat.doIt.
%
/* for some countries, the population of administrative divisions is given
   at different years. This can be detected by the number of answers to
   \flrule{C.adm\_div\_tab.pop\_col \mvd X}. In this case, for every
   population column, the years are derived and associated with the
   individual population columns. */
%
% Only applies when a table has more than one population column; the year is
% the first four-digit number in the column header.
T[morepops->>Yint;morepops@(Yint)->PS] :-
   N=count{PS [T]; T[pop_col->>PS]}, N > 1,
   T:admdivtab[header_row->>HR;pop_col->>PS],
   element(T,HR,PS)[contents->I;type->th],
   pmatch(I, "/([0-9]{4})/", "$1", Y),
   string2integer(Y,Yint).
%
?- sys.strat.doIt.
%
/* For tables giving the population at several years, the most recent
   population value is relevant for \Mondial. In every case, the
   \emph{single-valued} method
   \flrule{C.adm\_div\_tab.pop\_col \Fd integer} contains the relevant
   population column. */
%
% Several population columns: take the one with the greatest year.
T[pop_col->TPS] :-
   T:admdivtab..morepops[],
   TY=max{Y [T]; T[morepops->>Y]},
   T[morepops@(TY)->TPS].
%
% No year information: the multivalued pop_col is copied as it is.
T[pop_col->PS] :-
   T:admdivtab,
   not T..morepops[],
   T[pop_col->>PS].
%
?- sys.strat.doIt.
%
/* After analyzing the table structure, the administrative divisions can be
   extracted. */
%
?- sys.echo@("*** Administrative Divisions ***").
%
/* Non-two level administrative divisions:*/
%
% One province object per data row. Rows whose name contains "Total",
% "total" or "*" are skipped, as are rows whose population cell holds no digit.
C[adm_divs ->> prov(C,DRow):province
      [country->C;name_str->PN2;population->P2;area->A2]] :-
   C:country[admdivtab -> T[name_col->NS;area_col->AS;pop_col->PS]],
   not T.two_level[],
   element(T,DRow,NS)[contents->PN;type->td],
   not substr("Total", PN), not substr("total", PN),
   not substr("*",PN),    % Russia: parts of other counties
   element(T,DRow,AS)[contents->A;type->td],
   perl(replace.kommas, A, A2),
   element(T,DRow,PS)[contents->P;type->td],
   pmatch(P,"/[0-9]/","$1",_),
   perl(replace.kommas, P, P2),
   perl(format.province, PN, PN2).
%
/* evaluating capital columns in non-two level tables.*/
%
C[adm_divs ->> prov(C,DRow)[capital_str->CN2]] :-
   C:country[admdivtab -> T[cap_col->CS]],
   prov(C,DRow):province,
   not T.two_level[],
   element(T,DRow,CS)[contents->CN;type->td],
   perl(format.city_name, CN, CN2).
%
?- sys.strat.doIt.
%
/* For two-level administrative divisions, only the upper level is relevant.
   This is represented by bold entries. */
%
C[adm_divs ->> prov(C,DRow):province
      [country->C;name_str->PN2;population->P2;area->A2]] :-
   C:country[admdivtab -> T[name_col->NS;area_col->AS;
                            pop_col->PS;two_level->_]],
   element(T,DRow,NS)[contents->NI;type->td],
   NI[b@(0)->PN],
   not PN="Total", not PN="total",
   not substr("*",PN),    % Russia: parts of other counties
   element(T,DRow,AS)[contents->AI;type->td],
   AI[b@(0)->A],
   perl(replace.kommas, A, A2),
   element(T,DRow,PS)[contents->PI;type->td],
   PI[b@(0)->P],
   perl(replace.kommas, P, P2),
   perl(format.province, PN, PN2).
%
/* Capital columns in two-level tables: */
%
% The capital is a city name, so it is normalized with format.city_name,
% exactly as in the non-two-level rule above. (Formerly format.province was
% used here, which does not cut "X/Y" or "X, Y" forms and therefore produced
% capital names that could not be matched against city.name@(gs).)
C[adm_divs ->> prov(C,DRow):province[capital_str->CN2]] :-
   C:country[admdivtab -> T[name_col->NS;cap_col->CS;two_level->_]],
   element(T,DRow,NS)[contents->NI;type->td],
   NI[b@(0)->PN],
   not substr("*",PN),    % Russia: parts of other counties
   element(T,DRow,CS)[contents->CI;type->td],
   CI[b@(0)->CN],
   not CN = " ", not CN = "-",
   perl(format.city_name, CN, CN2).
%
/* There are some exceptions with the names and capitals of administrative
   divisions: \\
   Provinces where no capital is given are no exceptions.
   Some provinces have no name since they are identical with the
   municipalities which are their capitals. In this case the name is set
   to `` (munic.)''.
   In Great Britain, there seems to be a wrong capital entry for the province
   Rhondda. */
%
% No capital column entry at all: the name is taken as it is.
P[name->PN] :-
   P:province[name_str->PN],
   not P.capital_str[].
%
% A real capital entry (not blank, empty, "-" or ".."): the name is taken as
% it is ...
P[name->PN] :-
   P:province[name_str->PN;capital_str->CN],
   not CN = " ", not CN = "", not CN = "-", not CN = "..".
%
% ... and the capital entry becomes the capital name (except for Rhondda).
P[capital_name->>CN] :-
   P:province[name_str->PN;capital_str->CN],
   not substr("Rhondda",PN),
   not CN = " ", not CN = "", not CN = "-", not CN = "..".
%
P[capital_name->>"Rhondda"] :-    %% Data Inconsistency in GS.
   P:province[name_str->PN],
   substr("Rhondda",PN).
%
?- sys.strat.doIt.
%
% Remaining provinces (those that got no name above): the province is its own
% capital, and " (munic.)" is appended to the province name.
P[name->PN2; capital_name->>PN] :-
   P:province[name_str->PN; capital_str->CN],
   not P.name[],
   strcat(PN," (munic.)",PN2).
?- sys.strat.doIt.
%
/* References of the form ``Rangoon (Yangon)'' have to be resolved into two
   alternative names. */
%
?- sys.strat.doIt.
%
P[capital_name->>{N1,N2}] :-
   P:province[capital_name->>N],
   pmatch(N,"/([^(]*) \((.*)\)/", ["$1","$2"], [N1,N2]).
%
?- sys.strat.doIt.
/* \paragraph{Main Cities.}
   For the main city tables, the header row(s) and the name and population
   columns (population columns together with the year) are identified. Note
   that \flrule{header\_row} and \flrule{pop\_col} are again multivalued. */
%
?- sys.echo@("*** Main Cities: Analyzing Tables ***").
%
% Header substrings that mark a city-name column.
city_col_header("cit").
city_col_header("name").
city_col_header("agglomeration").
%
?- sys.strat.doIt.
%
T[city_col->>Col] :-
   T:maincitytab[header@(_,Col)->>Strg],
   substr(CSH,Strg),
   city_col_header(CSH).
%
% Population columns; the year is stored as the two digits following "19".
T[pop_col->>Col; pop_year@(HR,Col)->Y2] :-
   T:maincitytab[header@(HR,Col)->>Strg],
   substr("pop",Strg),
   pmatch(Strg,"/19([0-9][0-9])/","$1",Y),
   string2integer(Y,Y2).
%
/* Exception: Brazil has a slightly different layout.
*/
%
T[pop_col->>Col; pop_year@(HR,Col)->Y2] :-
   T:maincitytab[header@(HR,Col)->>Strg],
   substr("municipality",Strg),
   pmatch(Strg,"/19([0-9][0-9])/","$1",Y),
   string2integer(Y,Y2).
%
/* Several main city tables contain a column which gives the province to
   which the city belongs: */
%
T[prov_col->Col] :-
   T:maincitytab[header@(_,Col)->>Strg],
   substr("prov",Strg).
%
?- sys.strat.doIt.
%
/* There are also some main city tables which have a twocolumn layout. */
%
% A table is "twocolumn" when it has more than one city-name column.
T.twocolumn[] :-
   N=count{CS [T]; T:maincitytab[city_col->>CS]},
   N > 1.
%
?- sys.strat.doIt.
%
/* After analyzing the table structure, the main city information can be
   extracted. In case of tables with several header rows, one of them is
   chosen -- they are all the same. */
%
?- sys.echo@("*** Main Cities ***").
%
/* Tables without province columns:*/
%
% H is the greatest header row above the data row DZ; the year of the
% population column is looked up in that header row.
C[main_cities ->> cty(C,DZ):city
      [country->C;namestr->CN;population@(Y)->P2]] :-
   C:country[maincitytab -> T[city_col->>CS;pop_col->>PS]],
   not T.prov_col[],
   element(T,DZ,CS)[contents->CN;type->td],
   element(T,DZ,PS)[contents->P;type->td],
   perl(replace.kommas, P, P2),
   H=max{HR [T,DZ,PS,P]; T[header_row->>HR;city_col->>CS;pop_col->>PS],
         element(T,DZ,CS)[contents->CN],
         element(T,DZ,PS)[contents->P],
         HR < DZ},
   T[pop_year@(H,PS)->Y],
   not substr("Moskovskaya.obl.", CN),    %% Russia. Two Kaliningrads
   not T.twocolumn[].
%
/* Tables with a province column: */
% Rows whose province entry is "-" are skipped.
% NOTE(review): CPS is not used after strcat -- confirm whether the call is
% only meant as a check on CN and Pr.
C[main_cities ->> cty(C,DZ):city[country->C;namestr->CN;
                                 population@(Y)->P2;prov_name->Pr]] :-
   C:country[maincitytab -> T[city_col->>CS;pop_col->>PS;prov_col->PrS]],
   element(T,DZ,CS)[contents->CN;type->td],
   element(T,DZ,PS)[contents->P;type->td],
   perl(replace.kommas, P, P2),
   element(T,DZ,PrS)[contents->Pr;type->td],
   not Pr = "-",
   H=max{HR [T,DZ,PS,P]; T[header_row->>HR;city_col->>CS;pop_col->>PS],
         element(T,DZ,CS)[contents->CN],
         element(T,DZ,PS)[contents->P],
         HR < DZ},
   T[pop_year@(H,PS)->Y],
   strcat(CN,Pr,CPS),
   not T.twocolumn[].
%
/* Twocolumn tables: important to check if the last element in the second
   column actually contains data or if it is empty (the rule requires the
   normalized population value P2 to be greater than 0). */
%
% The population column is the one directly right of the city column
% (PS = CS + 1); the city column is part of the object id because one table
% row holds two cities.
C[main_cities ->> cty(C,DZ,CS):city[country->C;namestr->CN;population@(Y)->P2]] :-
   C:country[maincitytab -> T[header_row->>HR;city_col->>CS]],
   element(T,DZ,CS)[contents->CN;type->td],
   P2 > 0,
   element(T,DZ,PS)[contents->P;type->td],
   PS = CS + 1,
   perl(replace.kommas, P, P2),
   T[pop_year@(HR,PS)->Y],
   T.twocolumn[].
%
?- sys.strat.doIt.
%
/* Names of the form ``Vienna (Wien)'' have to be resolved into two
   alternative names. Thus, \flrule{city.name@(gs)\Fd string} is
   multivalued. */
%
% First name: everything before the first "(" or ",".
C[name@(gs)->>Name2] :-
   C:city[namestr->CNS],
   pmatch(CNS,"/\A([^(,]*)/","$1",Name),
   perl(format.city_name, Name, Name2).
%
% Second name: a parenthesized, capitalized part, unless it is a country name
% or one of the listed exceptions.
C[name@(gs)->>Name2] :-
   C:city[namestr->CNS],
   pmatch(CNS,"/\(([A-Z][a-zA-Z- ]*)\)/","$1",Name),
   not _:country[name@(gs)->>Name],
   not substr("(Guadalajara)",CNS),
   not substr("(Metro)",CNS),
   perl(format.city_name, Name, Name2).
%
?- sys.strat.doIt.
%
/* The countries' capitals are linked to city objects (where these already
   exist) by \flrule{country[capital \Fd city]}. */
%
?- sys.echo@("*** Linking capitals ***").
%
C[capital->Capcty] :-
   C:country[capital_name->>CN],
   Capcty:city[country->C;name@(gs)->>CN].
%
?- sys.strat.doIt.
%
/*For countries where the capital is not a main city (i.e., those where no
  main city data exist), the capital objects have to be created and
  \flrule{country[capital\Fd city]} is defined. */
%
?- sys.echo@("*** generate city objects for capitals which are not main cities ***").
%
% The id of the new city object is built from the capital name as rewritten
% by replace.spaces; the capital name "none" is ignored.
C[capital->cty(C,CN2):city[country->C;name@(gs)->>CN]],
C[main_cities->>cty(C,CN2)] :-
   C:country[capital_name->>CN],
   not C.capital[],
   not CN = "none",
   perl(replace.spaces,CN,CN2).
%
?- sys.strat.doIt.
%
/* \paragraph{Linking Cities to Provinces.}
   Simple cases: */
%
?- sys.echo@("*** Referencing cities' province objects ***").
%
% The province name given with the city equals a province name of the same
% country.
Cty[province->>Prov] :-
   Cty:city[country->C;prov_name->PN],
   Prov:province[country->C;name->PN].
%
?-sys.strat.doIt.
%
/* In several cases, the province name given with the city is only one of
   two possible names (e.g.\ ``K{\"a}rnten (Carinthia)''): */
% Only for cities that have no province yet: the city's province name must
% occur in parentheses inside the province's name.
Cty[country->C;province->>Prov] :-
   Cty:city[country->C;prov_name->PN],
   not Cty..province[],
   Prov:province[country->C;name->N],
   strcat("/\(",PN,PN1), strcat(PN1,"\)/",PN2),
   pmatch(N,PN2,"$1",X).
%
?-sys.strat.doIt.
%
/* In case of abbreviations, the matching is done by the first five letters: */
%
% PN must have at least five characters; PN without its last character is
% then used as a prefix pattern for the province name.
Cty[country->C;province->>Prov] :-
   Cty:city[country->C;prov_name->PN],
   not Cty..province[],
   Prov:province[country->C;name->N],
   pmatch(PN,"/.{5}/","$1",_),
   pmatch(PN,"/\A(.*).\Z/","$1/",PN1),
   strcat("/\A",PN1,PN2),
   pmatch(N,PN2,"$1",_).
%
?-sys.strat.doIt.
%
/* For the US, the postal codes have to be translated to the states' names,
   e.g., ``New York, NY''):*/
%
C[province->>Pr] :-
   C:city[namestr->CNS].country[name@(gs)->>"USA"],
   pmatch(CNS,"/, ([A-Z]{2})/","$1",Code),
   us_postal_code(Code,P),
   Pr[name->P].country[name@(gs)->>"USA"].
%
/* Note that Canadian provinces remain unresolved since non-official abbrevs
   are used. */
%
% Diagnostic query: cities with a province name but no province object.
?- C:city[country->CT;prov_name->PN], not C..province[].
%
?-sys.strat.doIt.
%
/* \paragraph{Linking Capitals of Provinces to Cities.}
   The provinces' capitals are linked to city objects (where these already
   exist) by \flrule{province[capital\Fd city]}. If the capital city is not
   yet associated with a province, set
   \flrule{city[province\Fd province]}. */
%
?- sys.echo@("*** Referencing province's capital objects ***").
%
% The city has no province yet: link it and assign it to this province.
P[capital->Capcty[province->>P]] :-
   P:province[country->C;capital_name->>CN],
   Capcty:city[country->C;name@(gs)->>CN],
   not Capcty..province[].
%
% The city already belongs to this province: only set the capital link.
P[capital->Capcty] :-
   P:province[country->C;name->PN;capital_name->>CN],
   Capcty:city[country->C;name@(gs)->>CN;province->>P[name->PN]].
%
?- sys.strat.doIt.
%
/*For provinces where the capital is not a main city (i.e., those where no
  main city data exist), the capital objects have to be created.
*/
%
?- sys.echo@("*** generate city objects for capitals of adm_divs ***").
%
P[capital->cty(C,CN2):city
      [country->C;name@(gs)->>CN;province->>P]],
C[main_cities->>cty(C,CN2)] :-
   P:province[country->C;capital_name->>CN],
   not P.capital[],
   perl(replace.spaces,CN,CN2).
%
?-sys.strat.doIt.
%
/* Thailand: cities are not associated to provinces. But the names match.*/
Cty[province->>P], P[capital->Cty] :-
   C:country[name@(gs) ->> "Thailand"],
   Cty:city[country->C; name@(gs)->>N],
   P:province[country->C; name->M],
   substr(N,M).
/* \paragraph{Output.}
   Using the \flrule{instance} output format, the resulting atoms are
   selected for output to \flrule{mondial-gs-facts.flp}.*/
%
?- sys.echo@("*** Generating Output ***").
?- sys.strat.doIt.
%
% Perl snippet that removes a previous output file via a shell "rm".
save[init->"`rm ../Mondial-facts/mondial-gs-facts.flp`; @out=@in;"].
%
?- sys.strat.doIt.
?- perl(save.init,[""],[""]).
%
?- sys.prn.style@("instance").    % set output mode
?- sys[output->sys.open@("../Mondial-facts/mondial-gs-facts.flp")].
?- sys.answerChannel.setStream@(output)[].
%
/* The following output queries also define the schema of the
   GlobalStatistics part of the database */
%
?- C:continent[name->N].
?- C:country[name@(gs)->>N;continent@(gs)->>CT].
?- C:country[capital->Cap;population@(gs)->Pop].
?- C:country[main_cities->>City:city[country->C;name@(gs)->>CN]].
?- C:city[population@(Y)->P].
?- C:country[adm_divs->>Pr:province[country->C;name->PN;population->P]].
?- C:city[province->>Pr].
?- Pr:province[capital->C].
?- Pr:province[area->AR].
%
?- sys.answerChannel.resetStream[].
?- sys.remove@("output").