Getting all the links from a webpage
Here is the PHP code I wrote to get all the links in a web page:
<?php
// Fetches a web page, extracts the target of every href attribute in it and
// prints each distinct link as an absolute URL.

/**
 * Extracts the targets of all href attributes found in an HTML document.
 *
 * @param string $html Raw HTML markup; may be empty.
 * @return array List of href values in document order (duplicates included,
 *               #fragments excluded, surrounding whitespace trimmed).
 */
function extractLinks($html)
{
    // Capture group 2 is the link target (optional scheme://host[:port]
    // followed by the path/query characters); a trailing #fragment and an
    // optional rel="nofollow" attribute are matched outside that group.
    $pattern = "/(href)\s*=\s*[\'\"]?(([a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/\?~=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?(\s*rel\s*=\s*[\'\"]?(nofollow)[\'\"]?)?/i";

    // preg_match_all() returns 0 when nothing matched and false on error.
    if (!preg_match_all($pattern, $html, $matches)) {
        return [];
    }

    // The URL character class admits spaces, so unquoted attributes can
    // capture trailing blanks; trim them off.
    return array_map('trim', $matches[2]);
}

/**
 * Turns an href value into an absolute URL relative to a site root.
 *
 * @param string $baseUrl        Site root without a trailing slash.
 * @param string $href           Link target as found in the page.
 * @param int    $maxQueryLength Query strings longer than this on
 *                               document-relative links are treated as
 *                               session ids and removed.
 * @return string Absolute URL.
 */
function resolveLink($baseUrl, $href, $maxQueryLength = 17)
{
    // Links that already start with "http" are used unchanged.
    if (strncasecmp($href, 'http', 4) === 0) {
        return $href;
    }

    // Root-relative link: append directly to the site root.
    if ($href !== '' && $href[0] === '/') {
        return $baseUrl . $href;
    }

    // Document-relative link: join with a slash.
    $link = $baseUrl . '/' . $href;

    // Strip session ids: cut the query string off with plain string
    // functions so that its content never has to be embedded in a regex.
    $queryStart = strpos($link, '?');
    if ($queryStart !== false && strlen($link) - $queryStart - 1 > $maxQueryLength) {
        $link = substr($link, 0, $queryStart);
    }

    return $link;
}

// Getting data from the url. The timeout keeps an unresponsive host from
// blocking the script indefinitely.
$url = 'http://www.rmmfa2004.org';
$context = stream_context_create(['http' => ['timeout' => 10]]);
$data = @file_get_contents($url, false, $context);
if ($data === false) {
    // Best effort: report the failure and carry on with an empty document
    // instead of feeding false into the regex.
    $lastError = error_get_last();
    $reason = $lastError !== null ? $lastError['message'] : 'unknown error';
    error_log("Could not fetch {$url}: {$reason}");
    $data = '';
}

// Extracting the links and printing the raw list.
$hrefs = extractLinks($data);
echo "\n";
print_r($hrefs);
echo "\n";

// Stripping trailing slashes from the url so joins below do not double them.
$url = rtrim($url, '/');
echo $url;
echo "\n";

// Generating absolute links. array_unique() preserves the original keys, so
// iterate with foreach rather than by numeric index.
$absoluteLinks = [];
foreach (array_unique($hrefs) as $href) {
    $link = resolveLink($url, $href);

    // Printing the link.
    echo $link;
    echo "\n";

    $absoluteLinks[] = $link;
}

// Different hrefs can resolve to the same absolute URL, so de-duplicate again.
$link1 = array_values(array_unique($absoluteLinks));
print_r($link1);
echo "\n";
