$url, 'cache' => 'wikipedia', 'return_data' => true )); if ($d->get_data_to_file()) { return $d->get_data(); } else { return null; } } function get_wikipedia_page($page, $site, $langsearch) { // $page will be eg 'Air_(French_band)' // $site will be eg 'en.wikipedia.org' // $langsearch is true if we want to find a page in the user's language // $domain is the language the user wants to use - eg 'fr' global $domain; global $userdomain; global $mobile; // $request_domain is the language of the page we've been asked to get $r = preg_match("#(.*?)\.#", $site, $matches); $request_domain = $matches[1]; $format_domain = $request_domain; $req = ""; if ($langsearch) { logger::log("WIKIPEDIA", "Request for page ".$page." from ".$site.". Domain is ".$request_domain." and user domain is ".$domain); $user_link = ($request_domain == $domain) ? $page : null; $english_link = ($site == "en.wikipedia.org") ? $page : null; logger::log("WIKIPEDIA", "User Link is ".$user_link." and english link is ".$english_link); if ($domain != $request_domain) { logger::log("WIKIPEDIA", "Asked for page ".$page." from site ".$site." but user wants domain ".$domain); // Find language links for the requested page $langlinks = wikipedia_request("http://".$site."/w/api.php?action=query&prop=langlinks&titles=".$page."&format=xml"); if ($langlinks !== null) { $langs = simplexml_load_string($langlinks); if ($langs->query->pages->page->langlinks) { foreach($langs->query->pages->page->langlinks->ll as $ll) { $l = $ll['lang']; $t = dom_import_simplexml($ll)->textContent; logger::log("WIKIPEDIA", "Found language link ".$l." title ".$t); if ($l == $domain) { $user_link = preg_replace('/ /', '_', $t); } if ($l == "en" && $english_link == null) { $english_link = preg_replace('/ /', '_', $t); } } } } } logger::log("WIKIPEDIA", "Language Scan Complete for ".$page); logger::log("WIKIPEDIA", "User Link is ".$user_link." 
and english link is ".$english_link); if ($user_link !== null) { $format_domain = $domain; $userdomain = true; $page = $user_link; $site = $domain.'.wikipedia.org'; } else if ($english_link !== null) { $page = $english_link; $site = "en.wikipedia.org"; $format_domain = "en"; } } if ($mobile) { $req = 'http://'.$site.'/w/api.php?action=mobileview§ions=all&prop=text&page='.$page.'&format=xml'; } else { $req = 'http://'.$site.'/w/api.php?action=parse&prop=text&page='.$page.'&format=xml'; } $xml = wikipedia_request($req); if ($xml !== null) { $info = ""; if ($mobile) { $info = simplexml_load_string($xml, 'SimpleXMLElement', LIBXML_NOCDATA); $reformat = ''; foreach($info->mobileview->sections->section as $section) { $reformat .= htmlspecialchars($section, ENT_QUOTES); } $reformat .= ''.$format_domain.''.$page.''; return $reformat; } else { $info = simplexml_load_string($xml, 'SimpleXMLElement', LIBXML_NOCDATA); $html = $info->parse->text; $matches = array(); if (preg_match( '/REDIRECT
  • addChild('rompr'); $meta->addChild('domain', $domain); $meta->addChild('page', $page); return $xml->asXML(); }

// Combine several single-page responses into one response string.
//
// $bits is a list of XML strings. Each one is parsed, its parse->text node
// is HTML-escaped and appended to the running text, and its rompr->domain
// and rompr->page nodes are remembered, so the values from the LAST element
// are the ones that end up in the result.
//
// NOTE(review): the string literals that join the three parts are empty in
// this copy of the file. They look like placeholders where wrapper markup
// used to be - confirm against the upstream source before relying on the
// exact output format.
function join_responses($bits) {
    $combined_text = "";
    $last_domain = "";
    $last_page = "";

    foreach ($bits as $response) {
        $doc = simplexml_load_string($response, 'SimpleXMLElement', LIBXML_NOCDATA);
        $combined_text .= htmlspecialchars($doc->parse->text, ENT_QUOTES);
        $last_domain = $doc->rompr->domain;
        $last_page = $doc->rompr->page;
    }

    return ''.$combined_text.''.$last_domain.''.$last_page.'';
}

// Send a finished response body to the client, labelled as text/xml.
function send_result($xml) {
    header('Content-Type: text/xml');
    echo $xml;
}

function send_failure($term) { $xml = ''; $xml .= htmlspecialchars('

    ', ENT_QUOTES).get_int_text("wiki_fail", array($term)).htmlspecialchars('

    ', ENT_QUOTES); $xml .= '
    '; $xml .= 'nullnull
    '; send_result($xml); }

// ==========================================================================
//
// Utility Functions
//
// ==========================================================================

// Escape a literal string so it can be embedded in a '/'-delimited regular
// expression and match only itself.
//
// preg_quote() covers every PCRE metacharacter plus the delimiter. The
// previous hand-written version only escaped ( ) ^ $ \ and /, so titles
// containing ? + * [ ] { } | or . either produced an invalid pattern or
// matched text they should not have.
function prepare_string($searchstring) {
    return preg_quote((string) $searchstring, '/');
}

// Search <domain>.wikipedia.org for $searchfor and, if one of the results
// has a title that matches it exactly (case-insensitively, ignoring
// surrounding whitespace), return that page via get_wikipedia_page().
//
// Returns '' when the request fails, the response cannot be parsed, or no
// result matches exactly.
function wikipedia_find_exact($searchfor, $domain) {
    $xml = wikipedia_request('http://'.$domain.'.wikipedia.org/w/api.php?action=query&list=search&srsearch=' . rawurlencode($searchfor) . '&srprop=score&format=xml');
    if ($xml == null) {
        return '';
    }

    $info = simplexml_load_string($xml, 'SimpleXMLElement', LIBXML_NOCDATA);
    // simplexml_load_string() returns false on malformed input; treat that,
    // and a response without any <p> results, as "nothing found".
    if ($info === false || !isset($info->query->search->p)) {
        return '';
    }

    // This is international, so we only look for an exact match (we can't
    // possibly translate every possibility that's in artist_search, etc)
    $page = null;
    foreach ($info->query->search->p as $id) {
        $title = (string) $id['title'];
        if (preg_match('/^\s*' . prepare_string($title) . '\s*$/i', $searchfor)) {
            $page = $title;
            break;
        }
    }

    if ($page === null || $page === '') {
        return '';
    }
    return get_wikipedia_page(preg_replace('/ /', '_', $page), $domain.".wikipedia.org", false);
}

// Look on en.wikipedia.org for a page titled exactly "<page> (disambiguation)"
// and return it via get_wikipedia_page() with language search enabled.
//
// Returns '' when the request fails, the response cannot be parsed, or no
// such page is among the search results.
function find_dismbiguation_page($page) {
    $searchfor = $page.' (disambiguation)';
    logger::log("WIKIPEDIA", "Searching Wikipedia for ".$searchfor);
    $xml = wikipedia_request('http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=' . rawurlencode($searchfor) . '&srprop=score&format=xml');
    // wikipedia_request() hands back null when the download fails; without
    // this guard the null was parsed and then dereferenced.
    if ($xml == null) {
        return '';
    }

    $results = simplexml_load_string($xml, 'SimpleXMLElement', LIBXML_NOCDATA);
    if ($results === false || !isset($results->query->search->p)) {
        return '';
    }

    foreach ($results->query->search->p as $id) {
        $title = (string) $id['title'];
        if ($title === $searchfor) {
            logger::log("WIKIPEDIA", "returning disambiguation page for ".$page);
            return get_wikipedia_page(preg_replace('/ /', '_', $title), "en.wikipedia.org", true);
        }
    }
    return '';
}

function wikipedia_get_list_of_suggestions($term) { global $domain; logger::log("WIKIPEDIA", "Getting list of suggestions for ".$term." 
from ".$domain.".wikipedia.org"); $xml = wikipedia_request('http://'.$domain.'.wikipedia.org/w/api.php?action=query&list=search&srsearch=' . rawurlencode($term) . '&srprop=score&format=xml'); if ($xml != "") { $html = ''; $xml = simplexml_load_string($xml, 'SimpleXMLElement', LIBXML_NOCDATA); if (count($xml->query->search->p) == 0) { return null; } $html .= htmlspecialchars('

    ', ENT_QUOTES).get_int_text("wiki_suggest", array($term)).htmlspecialchars('

    ', ENT_QUOTES); $html .= htmlspecialchars('

    ', ENT_QUOTES).get_int_text("wiki_suggest2").htmlspecialchars('

    ', ENT_QUOTES); $html .= htmlspecialchars('
    ", ENT_QUOTES); $html .= ''; $html .= ''.$domain.''.htmlspecialchars($term, ENT_QUOTES).''; return $html; } else { return ""; } }

// ==========================================================================
//
// Search Helpers
//
// ==========================================================================

// Run a list=search query against en.wikipedia.org and return the result
// titles as plain strings, in the order the API returned them.
//
// Returns an empty array when the request fails (wikipedia_request() gave
// null), when the response is not parseable XML, or when it has no results.
function wikipedia_fetch_search_titles($term) {
    $xml = wikipedia_request('http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=' . rawurlencode($term) . '&srprop=score&format=xml');
    if ($xml == null) {
        return array();
    }

    $info = simplexml_load_string($xml, 'SimpleXMLElement', LIBXML_NOCDATA);
    if ($info === false || !isset($info->query->search->p)) {
        return array();
    }

    $titles = array();
    foreach ($info->query->search->p as $id) {
        $title = (string) $id['title'];
        if ($title !== '') {
            $titles[] = $title;
        }
    }
    return $titles;
}

// True when $wanted is exactly $title, compared case-insensitively and
// ignoring leading/trailing whitespace in $wanted. The title is the pattern.
function wikipedia_title_is_exactly($title, $wanted) {
    return preg_match('/^\s*' . prepare_string($title) . '\s*$/i', $wanted) === 1;
}

// Return the first title in $titles that matches $regex (logging it), or
// null when none does. Here the title is the subject, not the pattern.
function wikipedia_first_title_matching($titles, $regex) {
    foreach ($titles as $title) {
        if (preg_match($regex, $title)) {
            logger::log("WIKIPEDIA", "Found Page : ".$title);
            return $title;
        }
    }
    return null;
}

// Return the first title in $titles that is exactly $wanted, or null.
function wikipedia_first_exact_title($titles, $wanted) {
    foreach ($titles as $title) {
        if (wikipedia_title_is_exactly($title, $wanted)) {
            return $title;
        }
    }
    return null;
}

// Decide whether a search-result title is acceptable for an artist name.
// A title is accepted as soon as any one of the rules below holds.
function wikipedia_artist_title_matches($title, $artist) {
    $the_artist = "The ".$artist;

    if (wikipedia_title_is_exactly($title, $artist)) {
        return true;
    }

    // A title carrying a "(... band)", "(... musician)" or "(... singer)"
    // qualifier is accepted outright. NOTE(review): the original reset its
    // candidate list on every iteration, so "exactly one candidate" was
    // always true here; that behaviour is preserved, not "fixed".
    if (preg_match('/\(.*?band\)|\(.*?musician\)|\(.*?singer\)/i', $title)) {
        return true;
    }

    // With or without a leading "The", either way round
    if (wikipedia_title_is_exactly($title, $the_artist)) {
        return true;
    }
    if (preg_match('/^\s*The ' . prepare_string($title) . '\s*$/i', $artist)) {
        return true;
    }

    // '&' for 'and', 'and' for '&', and any '.' removed from the title
    $variants = array(
        str_replace('&', 'and', $title),
        str_replace('and', '&', $title),
        str_replace('.', '', $title),
    );
    foreach ($variants as $variant) {
        if (wikipedia_title_is_exactly($variant, $artist)) {
            return true;
        }
    }

    // ... and any '.' removed from the artist name instead
    if (wikipedia_title_is_exactly($title, str_replace('.', '', $artist))) {
        return true;
    }

    // Words for numbers, numbers for words.
    $digits = array('1', '2', '3', '4', '5', '6', '7', '8', '9');
    $words = array("one", "two", "three", "four", "five", "six", "seven", "eight", "nine");
    $variants = array(
        str_replace($digits, $words, $title),
        str_replace($words, $digits, $title),
    );
    foreach ($variants as $variant) {
        if (wikipedia_title_is_exactly($variant, $artist) || wikipedia_title_is_exactly($variant, $the_artist)) {
            return true;
        }
    }

    return false;
}

// ==========================================================================
//
// Artist Search
//
// ==========================================================================

// Find a Wikipedia page for an artist. Tries, in order: an exact match on
// the user's own language site, the wider English artist search, a search
// for each member of a joined name ('A & B', 'A and B', 'A + B', 'A,B'),
// an English disambiguation page, and finally a list of suggestions.
function getArtistWiki($artist_name, $disambig) {
    global $domain;

    // First, try a search and exact match in the user's chosen language.
    // This is to catch the case where a page exists on that user's wikipedia
    // domain and it has no language links to the en site
    if ($domain != "en") {
        $h = wikipedia_find_exact($artist_name, $domain);
        if ($h != '') {
            return $h;
        }
    }

    // Now try a search on the english site. We can be more wide-ranging in this search
    // we do this in English because (a) it has the most stuff and (b) I can speak it.
    // We can find translation links later.
    $h = wikipedia_artist_search($artist_name, $disambig);
    if ($h != '') {
        return $h;
    }

    // No results returned. If there's an '&' or 'and' or '+' in the name - such as 'Fruitbat & Umbrella'
    // try querying for 'Fruitbat' and 'Umbrella' separately and if there are any results, display them all
    $artist = preg_replace('/ and /', ' & ', $artist_name);
    $artist = preg_replace('/\+/', '&', $artist);

    $alist = array();
    if (preg_match('/ & /', $artist) > 0) {
        $alist = explode(' & ', $artist);
    } elseif (preg_match('/,/', $artist) > 0) {
        $alist = explode(',', $artist);
    }

    $jhtml = array();
    foreach ($alist as $artistname) {
        $j = wikipedia_artist_search($artistname, "");
        if ($j != '') {
            $jhtml[] = $j;
        }
    }
    if (count($jhtml) > 0) {
        return join_responses($jhtml);
    }

    $h = find_dismbiguation_page($artist_name);
    if ($h != '') {
        return $h;
    }

    return wikipedia_get_list_of_suggestions($artist_name);
}

// Search en.wikipedia.org for an artist page.
//
// $artist   - the artist name
// $disambig - optional qualifier; when not "" the first search is for
//             "<artist> (<disambig>)"
//
// Returns the page from get_wikipedia_page() (language search enabled), or
// '' when nothing suitable is found. A failed or unparseable search response
// is treated as "no results" rather than being dereferenced.
function wikipedia_artist_search($artist, $disambig) {
    $page = null;

    if ($disambig != "") {
        $searchfor = $artist.' ('.$disambig.')';
        logger::log("WIKIPEDIA ARTIST", "Searching Wikipedia for ".$searchfor);
        $titles = wikipedia_fetch_search_titles($searchfor);

        // First look for exact match
        $page = wikipedia_first_exact_title($titles, $searchfor);

        // Failing that, accept a band/musician/singer page, but only when
        // there is exactly one such candidate
        if ($page === null) {
            $poss = array_values(preg_grep('/\(.*?band\)|\(.*?musician\)|\(.*?singer\)/i', $titles));
            if (count($poss) == 1) {
                $page = $poss[0];
            }
        }
    }

    if ($page === null) {
        logger::log("WIKIPEDIA ARTIST", "Searching Wikipedia for ".$artist);
        foreach (wikipedia_fetch_search_titles($artist) as $title) {
            if (wikipedia_artist_title_matches($title, $artist)) {
                $page = $title;
                break;
            }
        }
    }

    // Still nothing and the name has a parenthesised part: drop it and look
    // for an exact match on what is left
    if ($page === null && preg_match('/.*\(.*\).*/', $artist)) {
        $sf = trim(preg_replace('/\(.*?\)/', '', $artist));
        logger::log("WIKIPEDIA ARTIST", "Searching Wikipedia for ".$sf);
        $page = wikipedia_first_exact_title(wikipedia_fetch_search_titles($sf), $sf);
    }

    if ($page === null) {
        return '';
    }

    logger::log("WIKIPEDIA ARTIST", "Artist search found page ".$page);
    return get_wikipedia_page(preg_replace('/ /', '_', $page), "en.wikipedia.org", true);
}

// ==========================================================================
//
// Album Search
//
// ==========================================================================

// Find a Wikipedia page for an album. Tries, in order: an exact match on the
// user's own language site, the English album search, an English
// disambiguation page, and finally a list of suggestions.
function getAlbumWiki($album_name, $artist_name) {
    global $domain;

    // First, try a search and exact match in the user's chosen language.
    // This is to catch the case where a page exists on that user's wikipedia
    // domain and it has no language links to the en site
    if ($domain != "en") {
        $h = wikipedia_find_exact($album_name, $domain);
        if ($h != '') {
            return $h;
        }
    }

    // Now try a search on the english site. We can be more wide-ranging in this search
    // we do this in English because (a) it has the most stuff and (b) I can speak it.
    // We can find translation links later.
    $h = wikipedia_album_search($album_name, $artist_name);
    if ($h != '') {
        return $h;
    }

    $h = find_dismbiguation_page($album_name);
    if ($h != '') {
        return $h;
    }

    return wikipedia_get_list_of_suggestions($album_name);
}

// Search en.wikipedia.org for an album page.
//
// The album name is first passed through munge_album_name(). The results of
// a search for "<album> (album)" are then checked against these title forms,
// most specific first:
//   "<album> (<artist> album)", "<album> (album)", "<album> (<digits> album)",
//   "<album>"
// and if none fits, a second search for the bare album name is checked for
// an exact title.
//
// Returns the page from get_wikipedia_page() (language search enabled), or
// null when nothing is found.
function wikipedia_album_search($album, $artist) {
    $album = munge_album_name($album);
    logger::log("WIKIPEDIA ALBUM", "Searching Wikipedia for ".$album." (album)");
    $titles = wikipedia_fetch_search_titles($album." (album)");

    $quoted_album = prepare_string($album);
    $exact_pattern = '/^\s*' . $quoted_album . '\s*$/i';
    $patterns = array(
        '/^\s*' . $quoted_album . '\s+\(' . prepare_string($artist) . ' album\)' . '/i',
        '/^\s*' . $quoted_album . '\s+\(album\)' . '/i',
        '/^\s*' . $quoted_album . '\s+\(\d+ album\)' . '/i',
        $exact_pattern,
    );

    $page = null;
    foreach ($patterns as $regex) {
        $page = wikipedia_first_title_matching($titles, $regex);
        if ($page !== null) {
            break;
        }
    }

    if ($page === null) {
        logger::log("WIKIPEDIA ALBUM", "Searching Wikipedia for ".$album);
        $page = wikipedia_first_title_matching(wikipedia_fetch_search_titles($album), $exact_pattern);
    }

    if ($page === null) {
        return null;
    }

    logger::log("WIKIPEDIA ALBUM", "Album search found page ".$page);
    return get_wikipedia_page(preg_replace('/ /', '_', $page), "en.wikipedia.org", true);
}

// ==========================================================================
//
// Track Search
//
// ==========================================================================

// Find a Wikipedia page for a track. Tries, in order: an exact match on the
// user's own language site, the English track search, an English
// disambiguation page, and finally a list of suggestions.
function getTrackWiki($track_name, $artist_name) {
    global $domain;

    // First, try a search and exact match in the user's chosen language.
    // This is to catch the case where a page exists on that user's wikipedia
    // domain and it has no language links to the en site
    if ($domain != "en") {
        $h = wikipedia_find_exact($track_name, $domain);
        if ($h != '') {
            return $h;
        }
    }

    // Now try a search on the english site. We can be more wide-ranging in this search
    // we do this in English because (a) it has the most stuff and (b) I can speak it.
    // We can find translation links later.
    $h = wikipedia_track_search($track_name, $artist_name);
    if ($h != '') {
        return $h;
    }

    $h = find_dismbiguation_page($track_name);
    if ($h != '') {
        return $h;
    }

    return wikipedia_get_list_of_suggestions($track_name);
}

// Search en.wikipedia.org for a track page.
//
// Comments assume the following:
//   track is 'A Track'
//   artist is 'An Artist'
//
// The results of a search for 'A Track (song)' are checked for, in order,
// 'A Track (An Artist song)', 'A Track (song)' and 'A Track'; if none fits,
// a second search for 'A Track' is checked for an exact title.
//
// Returns the page from get_wikipedia_page() (language search enabled), or
// null when nothing is found.
function wikipedia_track_search($track, $trackartist) {
    logger::log("WIKIPEDIA TRACK", "Searching Wikipedia for ".$track." (song) by ".$trackartist);
    $titles = wikipedia_fetch_search_titles($track." (song)");

    $quoted_track = prepare_string($track);
    $exact_pattern = '/^\s*' . $quoted_track . '\s*$/i';
    $patterns = array(
        // Look for 'A Track (An Artist song)'
        '/^\s*' . $quoted_track . '\s+\(' . prepare_string($trackartist) . ' song\)' . '/i',
        // Look for 'A Track (song)'
        '/^\s*' . $quoted_track . '\s+\(song\)' . '/i',
        // Look for 'A Track'
        $exact_pattern,
    );

    $page = null;
    foreach ($patterns as $regex) {
        $page = wikipedia_first_title_matching($titles, $regex);
        if ($page !== null) {
            break;
        }
    }

    if ($page === null) {
        logger::log("WIKIPEDIA TRACK", "Searching Wikipedia for ".$track);
        $page = wikipedia_first_title_matching(wikipedia_fetch_search_titles($track), $exact_pattern);
    }

    if ($page === null) {
        return null;
    }

    logger::log("WIKIPEDIA TRACK", "Track search found page ".$page);
    return get_wikipedia_page(preg_replace('/ /', '_', $page), "en.wikipedia.org", true);
}
?>