$url,
'cache' => 'wikipedia',
'return_data' => true
));
if ($d->get_data_to_file()) {
return $d->get_data();
} else {
return null;
}
}
/**
 * Fetch the text of a Wikipedia page, optionally switching to the user's
 * language (or to English) by following the page's language links.
 *
 * @param string $page       Page title with underscores, eg 'Air_(French_band)'
 * @param string $site       Host to query, eg 'en.wikipedia.org'
 * @param bool   $langsearch True if we want to find the page in the user's language
 *
 * Reads the globals $domain (the language the user wants, eg 'fr') and $mobile
 * (selects the 'mobileview' API action instead of 'parse'). Sets the global
 * $userdomain to true when a page in the user's language is used.
 *
 * In the mobile branch the return value is a string built from the escaped
 * section texts followed by the language and page name.
 * NOTE(review): the non-mobile branch is truncated in this copy of the file,
 * so its exact return value is not documented here.
 */
function get_wikipedia_page($page, $site, $langsearch) {
// $page will be eg 'Air_(French_band)'
// $site will be eg 'en.wikipedia.org'
// $langsearch is true if we want to find a page in the user's language
// $domain is the language the user wants to use - eg 'fr'
global $domain;
global $userdomain;
global $mobile;
// $request_domain is the language of the page we've been asked to get
// (everything in $site before the first '.')
$r = preg_match("#(.*?)\.#", $site, $matches);
$request_domain = $matches[1];
// $format_domain is the language that ends up in the response; it starts as
// the requested language and is changed below if we switch sites
$format_domain = $request_domain;
$req = "";
if ($langsearch) {
logger::log("WIKIPEDIA", "Request for page ".$page." from ".$site.". Domain is ".$request_domain." and user domain is ".$domain);
// The requested page itself already counts as the user-language link and/or
// the English link when the requested site is in that language
$user_link = ($request_domain == $domain) ? $page : null;
$english_link = ($site == "en.wikipedia.org") ? $page : null;
logger::log("WIKIPEDIA", "User Link is ".$user_link." and english link is ".$english_link);
if ($domain != $request_domain) {
logger::log("WIKIPEDIA", "Asked for page ".$page." from site ".$site." but user wants domain ".$domain);
// Find language links for the requested page
$langlinks = wikipedia_request("http://".$site."/w/api.php?action=query&prop=langlinks&titles=".$page."&format=xml");
if ($langlinks !== null) {
$langs = simplexml_load_string($langlinks);
if ($langs->query->pages->page->langlinks) {
foreach($langs->query->pages->page->langlinks->ll as $ll) {
// 'lang' attribute is the language code, the element text is the page title
$l = $ll['lang'];
$t = dom_import_simplexml($ll)->textContent;
logger::log("WIKIPEDIA", "Found language link ".$l." title ".$t);
if ($l == $domain) {
$user_link = preg_replace('/ /', '_', $t);
}
// Only take the English link from the list if we don't already have one
if ($l == "en" && $english_link == null) {
$english_link = preg_replace('/ /', '_', $t);
}
}
}
}
}
logger::log("WIKIPEDIA", "Language Scan Complete for ".$page);
logger::log("WIKIPEDIA", "User Link is ".$user_link." and english link is ".$english_link);
// Prefer the user's language, then English; otherwise keep the page and
// site we were asked for
if ($user_link !== null) {
$format_domain = $domain;
$userdomain = true;
$page = $user_link;
$site = $domain.'.wikipedia.org';
} else if ($english_link !== null) {
$page = $english_link;
$site = "en.wikipedia.org";
$format_domain = "en";
}
}
if ($mobile) {
// NOTE(review): '§ions' in the URL below looks like a mis-encoded
// '&sections' parameter - confirm against the original file before relying on it
$req = 'http://'.$site.'/w/api.php?action=mobileview§ions=all&prop=text&page='.$page.'&format=xml';
} else {
$req = 'http://'.$site.'/w/api.php?action=parse&prop=text&page='.$page.'&format=xml';
}
$xml = wikipedia_request($req);
if ($xml !== null) {
$info = "";
if ($mobile) {
$info = simplexml_load_string($xml, 'SimpleXMLElement', LIBXML_NOCDATA);
$reformat = '';
// Concatenate the HTML-escaped text of every section
foreach($info->mobileview->sections->section as $section) {
$reformat .= htmlspecialchars($section, ENT_QUOTES);
}
// NOTE(review): the empty string literals below look like markup that was
// stripped from this copy of the file - confirm against the original
$reformat .= ''.$format_domain.''.$page.'';
return $reformat;
} else {
$info = simplexml_load_string($xml, 'SimpleXMLElement', LIBXML_NOCDATA);
$html = $info->parse->text;
$matches = array();
// NOTE(review): the next line appears truncated in this copy of the file (the
// regex literal is never closed and the rest of this branch is missing) -
// restore it from version control before editing this function
if (preg_match( '/REDIRECT addChild('rompr');
$meta->addChild('domain', $domain);
$meta->addChild('page', $page);
return $xml->asXML();
}
/**
 * Combine several page responses into a single response string.
 *
 * Each entry of $bits is parsed as XML; the HTML-escaped parse->text of every
 * entry is concatenated, while the rompr->domain and rompr->page values are
 * taken from the last entry only (each iteration overwrites them).
 *
 * @param array $bits XML strings to combine
 * @return string The concatenated text followed by the domain and the page
 *
 * NOTE(review): the empty string literals in the final concatenation look like
 * markup that was stripped from this copy of the file - confirm against the original.
 */
function join_responses($bits) {
$t = "";
$d = "";
$p = "";
foreach ($bits as $b) {
$info = simplexml_load_string($b, 'SimpleXMLElement', LIBXML_NOCDATA);
// Text accumulates across all responses...
$t .= htmlspecialchars($info->parse->text, ENT_QUOTES);
// ...but domain and page are simply overwritten, so the last response wins
$d = $info->rompr->domain;
$p = $info->rompr->page;
}
$reformat = ''.$t.''.$d.''.$p.'';
return $reformat;
}
/**
 * Send a response body to the client, labelled as XML.
 *
 * @param string $xml The response body to output
 */
function send_result($xml) {
    // The header has to go out before the body
    header('Content-Type: text/xml');
    echo $xml;
}
/**
 * Build a "nothing found" response for $term and send it with send_result().
 *
 * The message text comes from get_int_text("wiki_fail", array($term)).
 *
 * @param string $term The search term that produced no result
 *
 * NOTE(review): the empty string literals below look like markup that was
 * stripped from this copy of the file - confirm against the original.
 */
function send_failure($term) {
$xml = '';
$xml .= htmlspecialchars('', ENT_QUOTES).get_int_text("wiki_fail", array($term)).htmlspecialchars('
', ENT_QUOTES);
$xml .= '';
$xml .= 'nullnull';
send_result($xml);
}
// ==========================================================================
//
// Utility Functions
//
// ==========================================================================
/**
 * Escape a string so it can be embedded literally in a '/'-delimited regex.
 *
 * Callers splice the result into patterns such as '/^\s*' . $x . '\s*$/i'.
 * Every PCRE metacharacter is escaped, not just brackets, anchors and
 * slashes: titles like '*NSYNC' or '[spunge]' would otherwise produce an
 * invalid or wrong pattern, and a '.' in a title would match any character.
 *
 * @param mixed $searchstring Text to escape; anything castable to a string
 *                            (callers pass SimpleXML attribute values)
 * @return string The escaped text
 */
function prepare_string($searchstring) {
    // Explicit cast so SimpleXML attribute objects are handled the same way
    // on every PHP version
    return preg_quote((string) $searchstring, '/');
}
/**
 * Search one language's Wikipedia and fetch the page whose title is exactly
 * the search term (ignoring case and surrounding whitespace).
 *
 * @param string $searchfor The text to search for
 * @param string $domain    Language prefix of the site to search, eg 'fr'
 * @return mixed '' when the request fails or nothing matches exactly,
 *               otherwise whatever get_wikipedia_page() returns
 */
function wikipedia_find_exact($searchfor, $domain) {
    $response = wikipedia_request('http://'.$domain.'.wikipedia.org/w/api.php?action=query&list=search&srsearch=' . rawurlencode($searchfor) . '&srprop=score&format=xml');
    if ($response == null) {
        return '';
    }

    $results = simplexml_load_string($response, 'SimpleXMLElement', LIBXML_NOCDATA);
    $match = null;
    // This is international, so we only accept an exact match - we can't
    // possibly translate every variation the English searches allow for
    foreach ($results->query->search->p as $hit) {
        $pattern = '/^\s*' . prepare_string($hit['title']) . '\s*$/i';
        if (preg_match($pattern, $searchfor)) {
            $match = $hit['title'];
            break;
        }
    }

    if ($match == null) {
        return '';
    }
    return get_wikipedia_page(preg_replace('/ /', '_', $match), $domain.".wikipedia.org", false);
}
/**
 * Look for an English Wikipedia page titled '<page> (disambiguation)' and
 * fetch it.
 *
 * @param string $page The name to find a disambiguation page for
 * @return mixed '' when the request fails, the response cannot be parsed or
 *               no such page exists; otherwise whatever get_wikipedia_page()
 *               returns
 */
function find_dismbiguation_page($page) {
    $searchfor = $page.' (disambiguation)';
    logger::log("WIKIPEDIA", "Searching Wikipedia for ".$searchfor);
    $xml = wikipedia_request('http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=' . rawurlencode($searchfor) . '&srprop=score&format=xml');
    // A failed request yields no data; treat it as "not found" instead of
    // trying to parse it
    if ($xml == null) {
        return '';
    }
    $results = simplexml_load_string($xml, 'SimpleXMLElement', LIBXML_NOCDATA);
    // simplexml_load_string() returns false for malformed XML, and an error
    // response has no search results to iterate
    if ($results === false || !isset($results->query->search->p)) {
        return '';
    }
    foreach ($results->query->search->p as $id) {
        if ((string) $id['title'] === $searchfor) {
            logger::log("WIKIPEDIA", "returning disambiguation page for ".$page);
            return get_wikipedia_page(preg_replace('/ /', '_', $id['title']), "en.wikipedia.org", true);
        }
    }
    return '';
}
/**
 * Build a response listing the titles Wikipedia's search suggests for $term.
 *
 * Searches the Wikipedia of the global $domain language. The introductory
 * text comes from get_int_text("wiki_suggest") and get_int_text("wiki_suggest2").
 *
 * @param string $term The search term
 * @return mixed "" when the request returned nothing, null when the search
 *               had no results, otherwise the response string
 *
 * NOTE(review): the empty string literals below look like markup that was
 * stripped from this copy of the file - confirm against the original.
 */
function wikipedia_get_list_of_suggestions($term) {
global $domain;
logger::log("WIKIPEDIA", "Getting list of suggestions for ".$term." from ".$domain.".wikipedia.org");
$xml = wikipedia_request('http://'.$domain.'.wikipedia.org/w/api.php?action=query&list=search&srsearch=' . rawurlencode($term) . '&srprop=score&format=xml');
if ($xml != "") {
$html = '';
$xml = simplexml_load_string($xml, 'SimpleXMLElement', LIBXML_NOCDATA);
// Nothing to suggest
if (count($xml->query->search->p) == 0) {
return null;
}
$html .= htmlspecialchars('', ENT_QUOTES).get_int_text("wiki_suggest", array($term)).htmlspecialchars('
', ENT_QUOTES);
$html .= htmlspecialchars('', ENT_QUOTES).get_int_text("wiki_suggest2").htmlspecialchars('
', ENT_QUOTES);
$html .= htmlspecialchars('', ENT_QUOTES);
// One list entry per search result
foreach ($xml->query->search->p as $id) {
// NOTE(review): $link is not used in the visible code - presumably it was
// part of the stripped markup; verify against the original
$link = preg_replace('/\s/', '_', $id['title']);
$html .= htmlspecialchars('- '.$id['title'].'
', ENT_QUOTES);
}
$html .= htmlspecialchars("
", ENT_QUOTES);
$html .= '';
$html .= ''.$domain.''.htmlspecialchars($term, ENT_QUOTES).'';
return $html;
} else {
return "";
}
}
// ==========================================================================
//
// Artist Search
//
// ==========================================================================
/**
 * Find the best Wikipedia response for an artist.
 *
 * Tries, in order: an exact match on the user's own language site, the
 * wide-ranging English artist search, a search for each member of a joint
 * name ('A & B', 'A and B', 'A, B'), an English disambiguation page, and
 * finally a list of search suggestions.
 *
 * @param string $artist_name The artist to look up
 * @param string $disambig    Optional qualifier for the English search, or ""
 * @return mixed The first non-empty response found, or the result of
 *               wikipedia_get_list_of_suggestions()
 */
function getArtistWiki($artist_name, $disambig) {
    global $domain;

    // First, try a search and exact match in the user's chosen language.
    // This catches pages that exist on the user's wikipedia domain but have
    // no language links to the en site.
    if ($domain != "en") {
        $localised = wikipedia_find_exact($artist_name, $domain);
        if ($localised != '') {
            return $localised;
        }
    }

    // Now the english site, where the search can be more wide-ranging.
    // Translation links are followed later.
    $english = wikipedia_artist_search($artist_name, $disambig);
    if ($english != '') {
        return $english;
    }

    // Still nothing. A name such as 'Fruitbat & Umbrella' (also written with
    // 'and' or '+') or a comma-separated name is split up, each part is
    // searched for on its own and all the results are displayed together.
    $normalised = preg_replace('/\+/', '&', preg_replace('/ and /', ' & ', $artist_name));
    $separator = null;
    if (preg_match('/ & /', $normalised) > 0) {
        $separator = ' & ';
    } elseif (preg_match('/,/', $normalised) > 0) {
        $separator = ',';
    }

    $responses = array();
    if ($separator !== null) {
        foreach (explode($separator, $normalised) as $member) {
            $found = wikipedia_artist_search($member, "");
            if ($found != '') {
                $responses[] = $found;
            }
        }
    }
    if (count($responses) > 0) {
        return join_responses($responses);
    }

    $disambiguation = find_dismbiguation_page($artist_name);
    if ($disambiguation != '') {
        return $disambiguation;
    }
    return wikipedia_get_list_of_suggestions($artist_name);
}
/**
 * Run an English Wikipedia search and return the titles of the results.
 *
 * Private helper for wikipedia_artist_search().
 *
 * @param string $term The text to search for
 * @return array The 'title' attribute of every result, in result order; an
 *               empty array when the request fails, the response cannot be
 *               parsed or there are no results
 */
function wikipedia_artist_search_titles($term) {
    logger::log("WIKIPEDIA ARTIST", "Searching Wikipedia for ".$term);
    $xml = wikipedia_request('http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=' . rawurlencode($term) . '&srprop=score&format=xml');
    // A failed request yields no data; don't try to parse it
    if ($xml == null) {
        return array();
    }
    $info = simplexml_load_string($xml, 'SimpleXMLElement', LIBXML_NOCDATA);
    // simplexml_load_string() returns false for malformed XML, and an error
    // response has no search results to iterate
    if ($info === false || !isset($info->query->search->p)) {
        return array();
    }
    $titles = array();
    foreach ($info->query->search->p as $hit) {
        $titles[] = $hit['title'];
    }
    return $titles;
}

/**
 * Decide whether a search result title is an acceptable match for an artist.
 *
 * Private helper for wikipedia_artist_search(). All comparisons ignore case
 * and surrounding whitespace.
 *
 * @param mixed  $title  Result title (anything castable to a string)
 * @param string $artist The artist name being looked for
 * @return bool True if any of the matching rules below accepts the title
 */
function wikipedia_artist_title_matches($title, $artist) {
    // Built once rather than for every search result
    static $digit_patterns = array('/1/','/2/','/3/','/4/','/5/','/6/','/7/','/8/','/9/');
    static $digit_words = array("one", "two", "three", "four", "five", "six", "seven", "eight", "nine");
    static $word_patterns = array("/one/", "/two/", "/three/", "/four/", "/five/", "/six/", "/seven/", "/eight/", "/nine/");
    static $digits = array('1','2','3','4','5','6','7','8','9');

    $title = (string) $title;
    $the_artist = "The " . $artist;
    $exact = '/^\s*' . prepare_string($title) . '\s*$/i';

    // The title is the artist name
    if (preg_match($exact, $artist)) {
        return true;
    }
    // Any title qualified as a band, musician or singer is accepted as soon
    // as it is seen
    if (preg_match('/\(.*?band\)|\(.*?musician\)|\(.*?singer\)/i', $title)) {
        return true;
    }
    // A leading 'The' present on one side only
    if (preg_match($exact, $the_artist)) {
        return true;
    }
    if (preg_match('/^\s*The ' . prepare_string($title) . '\s*$/i', $artist)) {
        return true;
    }

    // '&' written as 'and' (and vice versa), and titles with their '.' removed
    $variants = array();
    if (preg_match('/&/', $title)) {
        $variants[] = preg_replace('/&/', 'and', $title);
    }
    if (preg_match('/and/', $title)) {
        $variants[] = preg_replace('/and/', '&', $title);
    }
    if (preg_match('/\./', $title)) {
        $variants[] = preg_replace('/\./', '', $title);
    }
    foreach ($variants as $variant) {
        if (preg_match('/^\s*' . prepare_string($variant) . '\s*$/i', $artist)) {
            return true;
        }
    }
    // ...and the other way round: artist names with their '.' removed
    if (preg_match('/\./', $artist) && preg_match($exact, preg_replace('/\./', '', $artist))) {
        return true;
    }

    // Words for numbers, numbers for words - each with or without 'The'
    $number_variants = array(
        preg_replace($digit_patterns, $digit_words, $title),
        preg_replace($word_patterns, $digits, $title),
    );
    foreach ($number_variants as $variant) {
        $pattern = '/^\s*' . prepare_string($variant) . '\s*$/i';
        if (preg_match($pattern, $artist) || preg_match($pattern, $the_artist)) {
            return true;
        }
    }
    return false;
}

/**
 * Search the English Wikipedia for an artist and fetch the best page.
 *
 * Up to three searches are made, stopping at the first that finds a page:
 *  1. 'artist (disambig)' when $disambig is given - accepts an exact title
 *     match, or failing that the single result qualified as a band, musician
 *     or singer (only if there is exactly one such result).
 *  2. 'artist' - accepts the first result that satisfies any rule in
 *     wikipedia_artist_title_matches().
 *  3. The artist name with any parenthesised parts removed, when it has
 *     some - accepts an exact title match only.
 *
 * A search whose request fails or whose response cannot be parsed counts as
 * having no results.
 *
 * @param string $artist   The artist to look up
 * @param string $disambig Optional qualifier, or "" for none
 * @return mixed '' when no page is found, otherwise whatever
 *               get_wikipedia_page() returns
 */
function wikipedia_artist_search($artist, $disambig) {
    $page = null;

    if ($disambig != "") {
        $searchfor = $artist.' ('.$disambig.')';
        $titles = wikipedia_artist_search_titles($searchfor);
        // First look for exact match
        foreach ($titles as $title) {
            if (preg_match('/^\s*' . prepare_string($title) . '\s*$/i', $searchfor)) {
                $page = $title;
                break;
            }
        }
        // Otherwise take the band/musician/singer result, but only if it is
        // unambiguous
        if ($page == null) {
            $poss = array();
            foreach ($titles as $title) {
                if (preg_match('/\(.*?band\)|\(.*?musician\)|\(.*?singer\)/i', $title)) {
                    $poss[] = $title;
                }
            }
            if (count($poss) == 1) {
                $page = array_shift($poss);
            }
        }
    }

    if ($page == null) {
        foreach (wikipedia_artist_search_titles($artist) as $title) {
            if (wikipedia_artist_title_matches($title, $artist)) {
                $page = $title;
                break;
            }
        }
    }

    if ($page == null && preg_match('/.*\(.*\).*/', $artist)) {
        $sf = trim(preg_replace('/\(.*?\)/','',$artist));
        foreach (wikipedia_artist_search_titles($sf) as $title) {
            if (preg_match('/^\s*' . prepare_string($title) . '\s*$/i', $sf)) {
                $page = $title;
                break;
            }
        }
    }

    if ($page == null) {
        return '';
    }
    logger::log("WIKIPEDIA ARTIST", "Artist search found page ".$page);
    return get_wikipedia_page(preg_replace('/ /', '_', $page), "en.wikipedia.org", true);
}
// ==========================================================================
//
// Album Search
//
// ==========================================================================
/**
 * Find the best Wikipedia response for an album.
 *
 * Tries, in order: an exact match on the user's own language site, the
 * English album search, an English disambiguation page, and finally a list
 * of search suggestions.
 *
 * @param string $album_name  The album to look up
 * @param string $artist_name The album's artist, used by the English search
 * @return mixed The first non-empty response found, or the result of
 *               wikipedia_get_list_of_suggestions()
 */
function getAlbumWiki($album_name, $artist_name) {
    global $domain;

    // First, try a search and exact match in the user's chosen language.
    // This catches pages that exist on the user's wikipedia domain but have
    // no language links to the en site.
    $found = ($domain != "en") ? wikipedia_find_exact($album_name, $domain) : '';

    // Now the english site, where the search can be more wide-ranging.
    // Translation links are followed later.
    if ($found == '') {
        $found = wikipedia_album_search($album_name, $artist_name);
    }
    if ($found == '') {
        $found = find_dismbiguation_page($album_name);
    }
    if ($found == '') {
        return wikipedia_get_list_of_suggestions($album_name);
    }
    return $found;
}
/**
 * Search the English Wikipedia for an album and fetch the best page.
 *
 * The album name is first passed through munge_album_name(). A search for
 * 'album (album)' is then checked against these title forms, most specific
 * first, each form being tried on every result before moving to the next:
 *   'Album (Artist album)', 'Album (album)', 'Album (<digits> album)', 'Album'.
 * If none matches, a second search for the plain album name is made and only
 * an exact title match is accepted.
 *
 * A search whose request fails or whose response cannot be parsed counts as
 * having no results.
 *
 * @param string $album  The album to look up
 * @param string $artist The album's artist
 * @return mixed null when no page is found, otherwise whatever
 *               get_wikipedia_page() returns
 */
function wikipedia_album_search($album, $artist) {
    $album = munge_album_name($album);
    $page = null;

    logger::log("WIKIPEDIA ALBUM", "Searching Wikipedia for ".$album." (album)");
    $xml = wikipedia_request('http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=' . rawurlencode($album." (album)") . '&srprop=score&format=xml');
    // A failed request yields no data; simplexml_load_string() itself returns
    // false for malformed XML
    $albuminfo = ($xml == null) ? false : simplexml_load_string($xml, 'SimpleXMLElement', LIBXML_NOCDATA);
    if ($albuminfo !== false && isset($albuminfo->query->search->p)) {
        // The patterns don't depend on the result being checked, so build
        // them once, in priority order
        $quoted_album = prepare_string($album);
        $patterns = array(
            '/^\s*' . $quoted_album . '\s+\(' . prepare_string($artist) . ' album\)/i',
            '/^\s*' . $quoted_album . '\s+\(album\)/i',
            '/^\s*' . $quoted_album . '\s+\(\d+ album\)/i',
            '/^\s*' . $quoted_album . '\s*$/i',
        );
        foreach ($patterns as $pattern) {
            // A more specific form has already matched
            if ($page != null) {
                break;
            }
            foreach ($albuminfo->query->search->p as $id) {
                if (preg_match($pattern, $id['title'])) {
                    logger::log("WIKIPEDIA", "Found Page : ".$id['title']);
                    $page = $id['title'];
                    break;
                }
            }
        }
    }

    if ($page == null) {
        logger::log("WIKIPEDIA ALBUM", "Searching Wikipedia for ".$album);
        $xml = wikipedia_request('http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=' . rawurlencode($album) . '&srprop=score&format=xml');
        $album2info = ($xml == null) ? false : simplexml_load_string($xml, 'SimpleXMLElement', LIBXML_NOCDATA);
        if ($album2info !== false && isset($album2info->query->search->p)) {
            $pattern = '/^\s*' . prepare_string($album) . '\s*$/i';
            foreach ($album2info->query->search->p as $id) {
                if (preg_match($pattern, $id['title'])) {
                    logger::log("WIKIPEDIA", "Found Page : ".$id['title']);
                    $page = $id['title'];
                    break;
                }
            }
        }
    }

    if ($page == null) {
        return null;
    }
    logger::log("WIKIPEDIA ALBUM", "Album search found page ".$page);
    return get_wikipedia_page(preg_replace('/ /', '_', $page), "en.wikipedia.org", true);
}
// ==========================================================================
//
// Track Search
//
// ==========================================================================
/**
 * Find the best Wikipedia response for a track.
 *
 * Tries, in order: an exact match on the user's own language site, the
 * English track search, an English disambiguation page, and finally a list
 * of search suggestions.
 *
 * @param string $track_name  The track to look up
 * @param string $artist_name The track's artist, used by the English search
 * @return mixed The first non-empty response found, or the result of
 *               wikipedia_get_list_of_suggestions()
 */
function getTrackWiki($track_name, $artist_name) {
    global $domain;

    // First, try a search and exact match in the user's chosen language.
    // This catches pages that exist on the user's wikipedia domain but have
    // no language links to the en site.
    $found = ($domain != "en") ? wikipedia_find_exact($track_name, $domain) : '';

    // Now the english site, where the search can be more wide-ranging.
    // Translation links are followed later.
    if ($found == '') {
        $found = wikipedia_track_search($track_name, $artist_name);
    }
    if ($found == '') {
        $found = find_dismbiguation_page($track_name);
    }
    if ($found == '') {
        return wikipedia_get_list_of_suggestions($track_name);
    }
    return $found;
}
/**
 * Search the English Wikipedia for a track and fetch the best page.
 *
 * A search for 'track (song)' is checked against these title forms, most
 * specific first, each form being tried on every result before moving to
 * the next (for track 'A Track' by 'An Artist'):
 *   'A Track (An Artist song)', 'A Track (song)', 'A Track'.
 * If none matches, a second search for the plain track name is made and only
 * an exact title match is accepted.
 *
 * A search whose request fails or whose response cannot be parsed counts as
 * having no results.
 *
 * @param string $track       The track to look up
 * @param string $trackartist The track's artist
 * @return mixed null when no page is found, otherwise whatever
 *               get_wikipedia_page() returns
 */
function wikipedia_track_search($track, $trackartist) {
    $page = null;

    logger::log("WIKIPEDIA TRACK", "Searching Wikipedia for ".$track." (song) by ".$trackartist);
    $xml = wikipedia_request('http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=' . rawurlencode($track." (song)") . '&srprop=score&format=xml');
    // A failed request yields no data; simplexml_load_string() itself returns
    // false for malformed XML
    $trackinfo = ($xml == null) ? false : simplexml_load_string($xml, 'SimpleXMLElement', LIBXML_NOCDATA);
    if ($trackinfo !== false && isset($trackinfo->query->search->p)) {
        // The patterns don't depend on the result being checked, so build
        // them once, in priority order
        $quoted_track = prepare_string($track);
        $patterns = array(
            '/^\s*' . $quoted_track . '\s+\(' . prepare_string($trackartist) . ' song\)/i',
            '/^\s*' . $quoted_track . '\s+\(song\)/i',
            '/^\s*' . $quoted_track . '\s*$/i',
        );
        foreach ($patterns as $pattern) {
            // A more specific form has already matched
            if ($page != null) {
                break;
            }
            foreach ($trackinfo->query->search->p as $id) {
                if (preg_match($pattern, $id['title'])) {
                    logger::log("WIKIPEDIA", "Found Page : ".$id['title']);
                    $page = $id['title'];
                    break;
                }
            }
        }
    }

    if ($page == null) {
        logger::log("WIKIPEDIA TRACK", "Searching Wikipedia for ".$track);
        $xml = wikipedia_request('http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=' . rawurlencode($track) . '&srprop=score&format=xml');
        $track2info = ($xml == null) ? false : simplexml_load_string($xml, 'SimpleXMLElement', LIBXML_NOCDATA);
        if ($track2info !== false && isset($track2info->query->search->p)) {
            $pattern = '/^\s*' . prepare_string($track) . '\s*$/i';
            foreach ($track2info->query->search->p as $id) {
                if (preg_match($pattern, $id['title'])) {
                    logger::log("WIKIPEDIA", "Found Page : ".$id['title']);
                    $page = $id['title'];
                    break;
                }
            }
        }
    }

    if ($page == null) {
        return null;
    }
    logger::log("WIKIPEDIA TRACK", "Track search found page ".$page);
    return get_wikipedia_page(preg_replace('/ /', '_', $page), "en.wikipedia.org", true);
}
?>