|
<%@ Import Namespace="System" %>
<%@ Import Namespace="System.Collections.Generic" %>
<%@ Import Namespace="System.IO" %>
<%@ Import Namespace="System.Net" %>
<%@ Import Namespace="System.Text" %>
<%@ Import Namespace="System.Text.RegularExpressions" %>
<%@ Import Namespace="System.Xml" %>
<script runat="server">
    ' Facebook Public Group Discussion Board RSS Generator (ASP.NET / VB).
    '
    ' On each request the page either serves the feed stored in the ASP.NET
    ' server cache under the requested feed name, or scrapes the group's
    ' discussion board page plus the page of each topic's last post, turns the
    ' result into <item> elements and caches the finished document for one day.
    '
    ' Query string:
    '   name       - feed name, used as the server-cache key (default "default")
    '   feedNumber - maximum number of items to return (default 10)
    ' NOTE: the cache key is the feed name only, so the feedNumber of the
    ' request that populated the cache decides the size of the cached feed.

    ' ----- Settings -----
    Dim discussionGroupID As String = "12345678901" ' Facebook Group ID
    Dim discussionGroupPage As String = "http://www.facebook.com/board.php?uid=" & discussionGroupID ' URL to Group Discussion Board
    Dim rssCacheClear As Integer = 0 ' 1 = clear server-cache

    ' ----- RSS Content Provider Data -----
    ' "topic" = link to the topic, "thumbnail" = link to the poster's thumbnail,
    ' anything else ("post") = link to the poster's post.
    Dim postLink As String = "topic"
    Dim rssDescriptionLen As Integer = 60 ' Number of characters to allow in the poster's post (description)
    ' 0 = leave the channel header out (plain XML), 1 = include it and serve as application/rss+xml
    Dim rssIncludeHeader As Integer = 0
    ' The feed_* values are written into the channel header verbatim, so they
    ' must already be XML-safe (write "&amp;" rather than "&").
    Dim feed_title As String = "FEED TITLE"
    Dim feed_link As String = "http://www.facebook.com/PUBLIC-GROUP/"
    Dim feed_description As String = "FEED DESCRIPTION"
    Dim feed_language As String = "en-us"
    Dim feed_pubdate As String = "11 Apr 2001 01:01:00 GMT"
    Dim feed_copyright As String = ""
    Dim feed_webmaster As String = "WEBMASTER@YOUREMAIL.COM"

    ' ----- Internal state (do not use as settings) -----
    Dim discussionPageData As String = Nothing
    Dim crawlAgent As String = Nothing
    Dim crawlError As String = Nothing ' Set by crawlPage when a download fails

    ' One scraped topic together with the details of its last post.
    Private Class PostRecord
        Public PostURL As String = ""
        Public TopicTitle As String = ""
        Public TopicTitleURL As String = ""
        Public PostTimeStamp As String = ""
        Public PostUserName As String = ""
        Public PostUserThumbnail As String = ""
        Public PostUserContent As String = ""
    End Class

    ' Downloads URL with an HTTP GET and returns the response body.
    ' On any failure the HTML-encoded exception message is stored in the
    ' page-level crawlError field and Nothing is returned; callers test
    ' crawlError rather than catching exceptions.
    Private Function crawlPage(ByVal URL As String) As String
        Dim crawlOutput As String = Nothing
        Try
            Dim myRequest As HttpWebRequest = CType(WebRequest.Create(URL), HttpWebRequest)
            myRequest.UserAgent = crawlAgent
            myRequest.Method = "GET"
            ' Using blocks release the connection even when reading throws.
            Using myResponse As HttpWebResponse = CType(myRequest.GetResponse(), HttpWebResponse)
                Using streamRead As New StreamReader(myResponse.GetResponseStream())
                    crawlOutput = streamRead.ReadToEnd()
                End Using
            End Using
        Catch ex As Exception
            crawlError = Server.HtmlEncode(ex.Message)
        End Try
        Return crawlOutput
    End Function

    ' Converts text such as "... on January 5, 2010 at 3:45pm" into
    ' "05 Jan 2010 15:45:00 GMT". Returns an empty string when the text does
    ' not contain a recognisable date.
    ' NOTE(review): the scraped time is labelled GMT without any conversion,
    ' exactly as before - confirm which time zone the source page uses.
    Private Function translateTimeStamp(ByVal rawTimeStamp As String) As String
        If rawTimeStamp Is Nothing Then Return ""

        Dim m As Match = Regex.Match(rawTimeStamp, _
            "on\s+([A-Za-z]+)\s+(\d{1,2}),?\s+(\d{4})\s+at\s+(\d{1,2}):(\d{2})\s*(am|pm)?", _
            RegexOptions.IgnoreCase)
        If Not m.Success Then Return ""

        Dim monthName As String = m.Groups(1).Value
        If monthName.Length < 3 Then Return ""
        Dim useMonth As String = monthName.Substring(0, 1).ToUpperInvariant() & _
                                 monthName.Substring(1, 2).ToLowerInvariant()

        Dim dayValue As Integer = CInt(m.Groups(2).Value)
        Dim hourValue As Integer = CInt(m.Groups(4).Value)
        Dim meridiem As String = m.Groups(6).Value.ToLowerInvariant()
        If meridiem.Length > 0 Then
            ' 12-hour clock: 12am is hour 0 and 12pm is hour 12.
            hourValue = hourValue Mod 12
            If meridiem = "pm" Then hourValue = hourValue + 12
        End If

        Return dayValue.ToString("00") & " " & useMonth & " " & m.Groups(3).Value & " " & _
               hourValue.ToString("00") & ":" & m.Groups(5).Value & ":00 GMT"
    End Function

    ' Case-insensitive substring test.
    Private Function containsText(ByVal haystack As String, ByVal needle As String) As Boolean
        Return haystack.IndexOf(needle, StringComparison.OrdinalIgnoreCase) >= 0
    End Function

    ' Returns the part of text that precedes marker (all of text when marker is absent).
    Private Function textBefore(ByVal text As String, ByVal marker As String) As String
        Dim cut As Integer = text.IndexOf(marker, StringComparison.Ordinal)
        If cut < 0 Then Return text
        Return text.Substring(0, cut)
    End Function

    ' Returns the HTML-decoded value between the first pair of double quotes
    ' in segment, or an empty string when there is none.
    Private Function firstQuotedValue(ByVal segment As String) As String
        Dim parts() As String = segment.Split(""""c)
        If parts.Length < 2 Then Return ""
        Return Server.HtmlDecode(parts(1))
    End Function

    ' Returns the value of the src attribute in segment (up to " alt="),
    ' with quotes removed and entities decoded; empty when there is no src.
    Private Function imageSource(ByVal segment As String) As String
        Dim start As Integer = segment.IndexOf("src=", StringComparison.Ordinal)
        If start < 0 Then Return ""
        Dim source As String = segment.Substring(start + 4).Replace("""", "")
        Dim altAt As Integer = source.IndexOf(" alt=", StringComparison.Ordinal)
        If altAt >= 0 Then source = source.Substring(0, altAt)
        Return Server.HtmlDecode(source)
    End Function

    ' Reduces an HTML fragment to plain text: line breaks become spaces, all
    ' other tags (including an unterminated tag at the end) are removed,
    ' entities are decoded, and the result is cut to rssDescriptionLen
    ' characters followed by "...".
    Private Function plainText(ByVal html As String) As String
        If html Is Nothing Then Return ""
        Dim text As String = Regex.Replace(html, "<br\s*/?>", " ", RegexOptions.IgnoreCase)
        text = Regex.Replace(text, "<[^>]*(>|$)", "")
        text = Server.HtmlDecode(text).Trim()
        If text.Length > rssDescriptionLen Then
            text = text.Substring(0, rssDescriptionLen) & "..."
        End If
        Return text
    End Function

    ' Walks the discussion board page, split on ">", with a small state
    ' machine and returns one PostRecord per topic found. Each state consumes
    ' exactly one segment:
    '   1 topic link -> 2 topic title -> 3 wait for class="post_user" ->
    '   4 post link -> 5 wait for "by <span" -> 6 poster name ->
    '   7 wait for class="topic_pager" -> 8 timestamp (record complete)
    ' A topic-title marker always starts a new record, whatever the current
    ' state, so a topic with unexpected markup cannot leak into the next one.
    Private Function parseTopicList(ByVal html As String) As List(Of PostRecord)
        Dim records As New List(Of PostRecord)()
        If html Is Nothing Then Return records

        Dim current As New PostRecord()
        Dim state As Integer = 0
        For Each segment As String In html.Split(">"c)
            Select Case state
                Case 1
                    current.TopicTitleURL = firstQuotedValue(segment)
                    state = 2
                Case 2
                    current.TopicTitle = plainText(textBefore(segment, "</"))
                    state = 3
                Case 3
                    If containsText(segment, "class=""post_user""") Then state = 4
                Case 4
                    current.PostURL = firstQuotedValue(segment)
                    state = 5
                Case 5
                    If containsText(segment, "by <span") Then state = 6
                Case 6
                    ' Kept exactly as scraped: it is matched against the post page later.
                    current.PostUserName = textBefore(segment, "</span")
                    state = 7
                Case 7
                    If containsText(segment, "class=""topic_pager""") Then state = 8
                Case 8
                    current.PostTimeStamp = translateTimeStamp(textBefore(segment, "</div"))
                    records.Add(current)
                    current = New PostRecord()
                    state = 0
            End Select

            If containsText(segment, "class=""topic_title_datawrap""") OrElse _
               containsText(segment, "class=""topic_title datawrap""") Then
                current = New PostRecord()
                state = 1
            End If
        Next
        Return records
    End Function

    ' Scans a topic page for posts written by record.PostUserName and stores
    ' the thumbnail URL and plain-text body of the last such post on the page
    ' in record. States:
    '   1 thumbnail <img> -> 2 wait for the uiStreamMessage class ->
    '   3 skip one segment -> 4 author check -> 5 collect the body up to </div>
    ' An image-block marker always restarts the search at state 1.
    Private Sub applyPosterDetails(ByVal html As String, ByVal record As PostRecord)
        If html Is Nothing Then Return

        Dim state As Integer = 0
        Dim thumbnail As String = ""
        Dim body As New StringBuilder()
        For Each segment As String In html.Split(">"c)
            Select Case state
                Case 1
                    thumbnail = imageSource(segment)
                    state = 2
                Case 2
                    If containsText(segment, "class=""uiStreamMessagembs""") OrElse _
                       containsText(segment, "class=""uiStreamMessage mbs""") Then state = 3
                Case 3
                    state = 4
                Case 4
                    If containsText(segment, record.PostUserName & "</strong") Then
                        body.Length = 0
                        state = 5
                    Else
                        state = 0
                    End If
                Case 5
                    ' Splitting on ">" removed the tag terminator; put it back
                    ' so plainText can recognise and strip the tag.
                    Dim piece As String = segment
                    If piece.IndexOf("<"c) >= 0 Then piece = piece & ">"
                    ' Every piece is appended, including the one that ends the
                    ' block, so text just before </div> is not lost.
                    body.Append(piece)
                    If containsText(piece, "</div>") Then
                        record.PostUserThumbnail = thumbnail
                        record.PostUserContent = plainText(body.ToString())
                        body.Length = 0
                        thumbnail = ""
                        state = 0
                    End If
            End Select

            If containsText(segment, "class=""UIImageBlock_Image_UIImageBlock_MED_Image""") OrElse _
               containsText(segment, "class=""UIImageBlock_Image UIImageBlock_MED_Image""") Then
                state = 1
            End If
        Next
    End Sub

    ' Builds one <item> element. Every value is XML-escaped here, so callers
    ' pass plain (decoded) text.
    Private Function buildItem(ByVal title As String, ByVal link As String, ByVal description As String, _
                               ByVal author As String, ByVal itemDate As String) As String
        Dim item As New StringBuilder()
        item.Append("<item>" & vbCrLf)
        item.Append("<title>" & Server.HtmlEncode(title) & "</title>" & vbCrLf)
        item.Append("<link>" & Server.HtmlEncode(link) & "</link>" & vbCrLf)
        item.Append("<description>" & Server.HtmlEncode(description) & "</description>" & vbCrLf)
        item.Append("<author>" & Server.HtmlEncode(author) & "</author>" & vbCrLf)
        item.Append("<date>" & Server.HtmlEncode(itemDate) & "</date>" & vbCrLf)
        item.Append("</item>" & vbCrLf)
        Return item.ToString()
    End Function

    ' Wraps item markup in <rss><channel>, adding the channel header when
    ' rssIncludeHeader = 1.
    Private Function wrapChannel(ByVal feedContent As String) As String
        Dim feed As New StringBuilder()
        feed.Append("<rss version=""2.0"">" & vbCrLf)
        feed.Append("<channel>" & vbCrLf)
        If rssIncludeHeader = 1 Then
            feed.Append("<title>" & feed_title & "</title>" & vbCrLf)
            feed.Append("<link>" & feed_link & "</link>" & vbCrLf)
            feed.Append("<description>" & feed_description & "</description>" & vbCrLf)
            feed.Append("<language>" & feed_language & "</language>" & vbCrLf)
            feed.Append("<date>" & feed_pubdate & "</date>" & vbCrLf)
            feed.Append("<copyright>" & feed_copyright & "</copyright>" & vbCrLf)
            feed.Append("<webmaster>" & feed_webmaster & "</webmaster>" & vbCrLf)
        End If
        feed.Append(feedContent)
        feed.Append("</channel>" & vbCrLf)
        feed.Append("</rss>")
        Return feed.ToString()
    End Function

    ' Scrapes the board and the last post of each topic and returns the
    ' complete feed document, or Nothing when any download failed (crawlError
    ' is then set).
    Private Function buildFreshFeed(ByVal maxItems As Integer) As String
        discussionPageData = crawlPage(discussionGroupPage)
        If Len(crawlError) > 0 Then Return Nothing

        Dim records As List(Of PostRecord) = parseTopicList(discussionPageData)
        ' Only the first maxItems topics are emitted, so only those are crawled.
        If records.Count > maxItems Then records.RemoveRange(maxItems, records.Count - maxItems)

        Dim feedContent As New StringBuilder()
        If records.Count = 0 Then
            feedContent.Append(buildItem("Page Found But Data Not Recognized", "", _
                                         "Page Found But Data Not Recognized", "", _
                                         "01 Jan 1900 00:00:01 GMT"))
        End If

        For Each record As PostRecord In records
            discussionPageData = crawlPage(record.PostURL)
            If Len(crawlError) > 0 Then Return Nothing
            applyPosterDetails(discussionPageData, record)

            ' Fall back to the topic title when no post text was found.
            Dim content As String = record.PostUserContent
            If Len(content) = 0 Then content = record.TopicTitle

            Dim link As String
            If LCase(postLink) = "thumbnail" Then
                link = record.PostUserThumbnail
            ElseIf LCase(postLink) = "topic" Then
                link = record.TopicTitleURL
            Else
                link = record.PostURL
            End If

            feedContent.Append(buildItem(content, link, content, _
                                         Server.HtmlDecode(record.PostUserName), record.PostTimeStamp))
        Next

        Return wrapChannel(feedContent.ToString())
    End Function

    ' Returns the contents of rssCopy.rss from the page's folder. The contents
    ' are written to the response as the whole document body, so the file is
    ' expected to hold a complete <rss> element. When the file does not exist
    ' a placeholder feed is returned so the response is still well-formed XML.
    Private Function loadLocalCopy() As String
        Dim copyPath As String = System.IO.Path.Combine(Server.MapPath("."), "rssCopy.rss")
        If System.IO.File.Exists(copyPath) Then
            Return System.IO.File.ReadAllText(copyPath)
        End If
        Return wrapChannel(buildItem("Feed Temporarily Unavailable", "", _
                                     "Feed Temporarily Unavailable", "", _
                                     "01 Jan 1900 00:00:01 GMT"))
    End Function

    Private Sub Page_Load(ByVal sender As System.Object, ByVal e As System.EventArgs)
        ' The visitor's user agent is forwarded on every crawl.
        crawlAgent = Request.UserAgent

        ' Name of the feed (this determines the name of the server cache).
        Dim feedName As String = Request.QueryString("name")
        If String.IsNullOrEmpty(feedName) Then feedName = "default"

        ' Maximum number of items to return; anything that is not a positive
        ' integer falls back to 10 instead of raising a conversion error.
        Dim maxFeedsTotalAccept As Integer = 0
        If Not Integer.TryParse(Request.QueryString("feedNumber"), maxFeedsTotalAccept) _
           OrElse maxFeedsTotalAccept <= 0 Then
            maxFeedsTotalAccept = 10
        End If

        Dim rssFeed As String = Nothing

        If rssCacheClear = 0 Then
            rssFeed = TryCast(HttpContext.Current.Cache(feedName), String)
            If rssFeed Is Nothing Then
                rssFeed = buildFreshFeed(maxFeedsTotalAccept)
                If rssFeed IsNot Nothing Then
                    ' Save into the server cache for one day.
                    HttpContext.Current.Cache.Add(feedName, rssFeed, Nothing, DateTime.Now.AddDays(1), _
                        System.Web.Caching.Cache.NoSlidingExpiration, _
                        System.Web.Caching.CacheItemPriority.Normal, Nothing)
                Else
                    ' Error contacting the target: serve the local copy.
                    rssFeed = loadLocalCopy()
                End If
            End If
        Else
            ' Clear the server cache and report that it was cleared.
            HttpContext.Current.Cache.Remove(feedName)
            rssFeed = wrapChannel(buildItem("Server Cache Cleared", "", "Server Cache Cleared", "", _
                                            "01 Jan 1900 00:00:01 GMT"))
        End If

        ' Generate output.
        Response.Buffer = False
        If rssIncludeHeader = 1 Then
            Response.ContentType = "application/rss+xml"
        Else
            Response.ContentType = "text/xml"
        End If
        Response.Write("<" & "?" & "xml version=""1.0"" encoding=""utf-8""" & "?" & ">" & vbCrLf)
        Response.Write(rssFeed)
    End Sub
</script>
|
#!/usr/bin/perl -w
########################################################
# Facebook Public Group Discussion Board RSS Generator #
########################################################
#
# CGI script. Once per day (tracked by the day-of-year stored in rssDay.txt)
# it downloads the group's discussion board page and the page of each topic's
# last post, builds <item> elements from them and saves the items to
# rssCopy.rss. On every other request, or when the download fails, the saved
# items are served instead.

use strict;
use warnings;
use IO::Socket::INET;

##### Declare Container For Response Stream
our $captured_response = "";

##### Variable Arguments Passed Into Bot Engine
our ($port, $agent_name, $agent_method, $target_domain, $target_full_address,
     $request_method, $request_protocol);

##### Other
our ($rssDescriptionLen, $maxFeedsTotalAccept, $feedContent, $postLink, $rssIncludeHeader);
our ($feed_title, $feed_link, $feed_description, $feed_language, $feed_pubdate,
     $feed_copyright, $feed_webmaster);

##### Connection Settings
$target_domain       = "www.facebook.com";           # Site to contact
$target_full_address = "/board.php?uid=12345678901"; # Full path to public group discussion board
$port                = 80;
$agent_name          = $ENV{HTTP_USER_AGENT};
$agent_method        = "http-get/0.1";               # User-Agent used when the visitor sent none
$request_method      = "GET";
$request_protocol    = "HTTP/1.0";

##### RSS Settings
$rssDescriptionLen   = 22;          # Maximum number of characters to allow in the title
$maxFeedsTotalAccept = 6;           # Maximum number of links to get
$postLink            = "thumbnail"; # "thumbnail" - URL to thumbnail image, "topic" - URL to topic, "post" - URL to post
$rssIncludeHeader    = 1;           # 1 - include channel information (RSS), 0 - leave channel information out (XML)
# The feed_* values are printed into the channel header verbatim, so they
# must already be XML-safe.
$feed_title       = "FEED TITLE";
$feed_link        = "http://www.facebook.com/PUBLIC-GROUP/";
$feed_description = "FEED DESCRIPTION";
$feed_language    = "en-us";
$feed_pubdate     = "11 Apr 2001 01:01:00 GMT";
$feed_copyright   = "";
$feed_webmaster   = "WEBMASTER\@YOUREMAIL.COM";

##### Date Filter
# Converts text such as "... on January 5, 2010 at 3:45pm" into
# "05 Jan 2010 15:45:00 GMT". Returns "" when no date is recognised.
# NOTE(review): the scraped time is labelled GMT without conversion - confirm
# which time zone the source page uses.
sub translateTimeStamp {
    my ($rawdate) = @_;
    return "" unless defined $rawdate;
    return "" unless $rawdate =~
        /on\s+([A-Za-z]+)\s+(\d{1,2}),?\s+(\d{4})\s+at\s+(\d{1,2}):(\d{2})\s*(am|pm)?/i;

    my ($month, $day, $year, $hour, $minute) = ($1, $2, $3, $4, $5);
    my $meridiem = defined $6 ? lc($6) : "";
    if ($meridiem ne "") {
        # 12-hour clock: 12am is hour 0 and 12pm is hour 12.
        $hour %= 12;
        $hour += 12 if $meridiem eq "pm";
    }
    return sprintf("%02d %s %s %02d:%s:00 GMT",
                   $day, ucfirst(lc(substr($month, 0, 3))), $year, $hour, $minute);
}

##### The Crawler
# Sends one HTTP request and stores the raw response (status line, headers
# and body) in $captured_response, replacing whatever it held before.
# Returns "Connection Successful" on success, otherwise a short error text.
sub botEngine {
    my ($port, $agent_name, $agent_method, $target_domain, $target_full_address,
        $request_method, $request_protocol) = @_;

    $captured_response = "";

    my $socket = IO::Socket::INET->new(
        PeerAddr => $target_domain,
        PeerPort => $port,
        Proto    => 'tcp',
        Timeout  => 10,
    ) or return "Socket Connection Error 1";
    $socket->autoflush(1);

    # Forward the visitor's user agent; fall back to the bot's own name.
    my $user_agent = (defined $agent_name && length $agent_name) ? $agent_name : $agent_method;
    $user_agent =~ s/[\r\n]//g;    # never let a header value break the request

    print {$socket} "$request_method $target_full_address $request_protocol\r\n"
                  . "Host: $target_domain\r\n"
                  . "User-Agent: $user_agent\r\n"
                  . "Connection: close\r\n"
                  . "\r\n";

    my $datastream;
    while (sysread($socket, $datastream, 1024 * 1024)) {
        $captured_response .= $datastream;
    }

    return "Socket Close Error 1" unless close($socket);
    return "Connection Successful";
}

##### Helpers

# Strips line breaks from a raw response, keeps what lies between <body and
# </body when present, and returns the result split on ">".
sub pageSegments {
    my ($html) = @_;
    $html = "" unless defined $html;
    $html =~ s/[\r\n]//g;
    if ($html =~ /.*<body(.*)<\/body/i) {
        $html = $1;
    }
    return split(/>/, $html);
}

# Decodes the predefined entities (and &#39;); &amp; is handled last so that
# "&amp;lt;" correctly becomes "&lt;".
sub decodeEntities {
    my ($text) = @_;
    return "" unless defined $text;
    $text =~ s/&lt;/</gi;
    $text =~ s/&gt;/>/gi;
    $text =~ s/&quot;/"/gi;
    $text =~ s/&(?:apos|#0*39);/'/gi;
    $text =~ s/&amp;/&/gi;
    return $text;
}

# Escapes text for use as XML element content.
sub xmlEscape {
    my ($text) = @_;
    return "" unless defined $text;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}

# Removes tags (including an unterminated tag at the end), decodes entities
# and cuts the text to $rssDescriptionLen characters followed by "...".
sub stripMarkup {
    my ($html) = @_;
    $html = "" unless defined $html;
    $html =~ s/<[^>]*(?:>|$)//g;
    my $text = decodeEntities($html);
    if (length($text) > $rssDescriptionLen) {
        $text = substr($text, 0, $rssDescriptionLen) . "...";
    }
    return $text;
}

# Returns the decoded value between the first pair of double quotes, or "".
sub firstQuotedValue {
    my ($segment) = @_;
    my @parts = split(/"/, $segment);
    return defined $parts[1] ? decodeEntities($parts[1]) : "";
}

# Walks the board page with a state machine (one segment per state) and
# returns a list of hash references, one per topic:
#   1 topic link -> 2 topic title -> 3 wait for class="post_user" ->
#   4 post link -> 5 wait for "by <span" -> 6 poster name ->
#   7 wait for class="topic_pager" -> 8 timestamp (record complete)
# A topic-title marker always starts a new record.
sub parseTopicList {
    my ($html) = @_;
    my @posts;
    my %current;
    my $state = 0;

    foreach my $segment (pageSegments($html)) {
        if ($state == 1) {
            $current{topicTitleURL} = firstQuotedValue($segment);
            $state = 2;
        }
        elsif ($state == 2) {
            my ($rawTitle) = split(/<\//, $segment);
            $current{topicTitle} = stripMarkup($rawTitle);
            $state = 3;
        }
        elsif ($state == 3) {
            $state = 4 if $segment =~ /class="post_user"/i;
        }
        elsif ($state == 4) {
            $current{postURL} = firstQuotedValue($segment);
            $state = 5;
        }
        elsif ($state == 5) {
            $state = 6 if $segment =~ /by <span/i;
        }
        elsif ($state == 6) {
            # Kept exactly as scraped: it is matched against the post page later.
            my ($name) = split(/<\/span/, $segment);
            $current{postUserName} = defined $name ? $name : "";
            $state = 7;
        }
        elsif ($state == 7) {
            $state = 8 if $segment =~ /class="topic_pager"/i;
        }
        elsif ($state == 8) {
            my ($rawStamp) = split(/<\/div/, $segment);
            $current{postTimeStamp} = translateTimeStamp($rawStamp);
            push @posts, {
                topicTitleURL     => "",
                topicTitle        => "",
                postURL           => "",
                postUserName      => "",
                postUserThumbnail => "",
                postUserContent   => "",
                %current,
            };
            %current = ();
            $state   = 0;
        }

        if ($segment =~ /topic_title_datawrap/i || $segment =~ /topic_title datawrap/i) {
            %current = ();
            $state   = 1;
        }
    }
    return @posts;
}

# Scans a topic page for posts by $post->{postUserName} and stores the
# thumbnail URL and plain-text body of the last such post in $post. States:
#   1 thumbnail <img> -> 2 wait for the uiStreamMessage class ->
#   3 skip one segment -> 4 author check -> 5 collect the body up to </div>
sub applyPosterDetails {
    my ($post, $html) = @_;
    # The poster name is kept for the whole scan and quoted, so regex
    # metacharacters in a name cannot change what is matched.
    my $poster = quotemeta($post->{postUserName});
    my ($state, $thumbnail, $body) = (0, "", "");

    foreach my $segment (pageSegments($html)) {
        if ($state == 1) {
            my @bySrc = split(/src=/, $segment);
            my $source = defined $bySrc[1] ? $bySrc[1] : "";
            ($source) = split(/\salt=/, $source);
            $source = "" unless defined $source;
            $source =~ s/"//g;
            $thumbnail = decodeEntities($source);
            $state = 2;
        }
        elsif ($state == 2) {
            $state = 3 if $segment =~ /uiStreamMessagembs/i || $segment =~ /uiStreamMessage mbs/i;
        }
        elsif ($state == 3) {
            $state = 4;
        }
        elsif ($state == 4) {
            if ($segment =~ /$poster<\/strong/i) {
                $body  = "";
                $state = 5;
            }
            else {
                $state = 0;
            }
        }
        elsif ($state == 5) {
            # Splitting on ">" removed the tag terminator; put it back so
            # stripMarkup can recognise and strip the tag.
            my $piece = $segment;
            $piece .= ">" if $piece =~ /</;
            # Every piece is appended, including the one that ends the block,
            # so text just before </div> is not lost.
            $body .= $piece;
            if ($piece =~ /<\/div>/i) {
                $post->{postUserThumbnail} = $thumbnail;
                $post->{postUserContent}   = stripMarkup($body);
                ($thumbnail, $body) = ("", "");
                $state = 0;
            }
        }

        if ($segment =~ /UIImageBlock_Image_UIImageBlock_MED_Image/i
            || $segment =~ /UIImageBlock_Image UIImageBlock_MED_Image/i) {
            $state = 1;
        }
    }
    return;
}

# Builds one <item> element from a post record; all values are XML-escaped.
sub buildItem {
    my ($post) = @_;
    my $content = length($post->{postUserContent}) ? $post->{postUserContent} : $post->{topicTitle};
    my $link = $postLink eq "thumbnail" ? $post->{postUserThumbnail}
             : $postLink eq "topic"     ? $post->{topicTitleURL}
             :                            $post->{postURL};
    my $stamp = defined $post->{postTimeStamp} ? $post->{postTimeStamp} : "";

    return "<item>\r\n"
         . "<title>" . xmlEscape($content) . "</title>\r\n"
         . "<link>" . xmlEscape($link) . "</link>\r\n"
         . "<description>" . xmlEscape($content) . "</description>\r\n"
         . "<author>" . xmlEscape(decodeEntities($post->{postUserName})) . "</author>\r\n"
         . "<date>" . xmlEscape($stamp) . "</date>\r\n"
         . "</item>\r\n";
}

# Crawls the board and the last post of each topic and returns the item
# markup, or "" when the board could not be fetched or nothing was recognised.
sub buildFreshItems {
    return "" unless length($target_domain) > 0 && length($target_full_address) > 0;

    my $status = botEngine($port, $agent_name, $agent_method, $target_domain,
                           $target_full_address, $request_method, $request_protocol);
    return "" unless $status eq "Connection Successful";

    my @posts = parseTopicList($captured_response);
    splice(@posts, $maxFeedsTotalAccept) if @posts > $maxFeedsTotalAccept;

    my $items = "";
    foreach my $post (@posts) {
        if ($post->{postURL} =~ m{^[a-z]+://([^/]+)(/.*)?$}i) {
            my ($posterDomain, $posterFullPath) = ($1, defined $2 ? $2 : "/");
            my $postStatus = botEngine($port, $agent_name, $agent_method, $posterDomain,
                                       $posterFullPath, $request_method, $request_protocol);
            applyPosterDetails($post, $captured_response)
                if $postStatus eq "Connection Successful";
        }
        $items .= buildItem($post);
    }
    return $items;
}

# Returns the saved item markup with every line ending normalised to CRLF,
# or "" when the file cannot be read.
sub readLocalCopy {
    my ($path) = @_;
    my $content = "";
    if (open(my $fh, '<', $path)) {
        while (my $row = <$fh>) {
            $row =~ s/[\r\n]+$//;
            $content .= $row . "\r\n";
        }
        close($fh);
    }
    return $content;
}

# Replaces the file's contents; the file is made writable first and left
# with mode 0644. Returns 1 on success, 0 when the file cannot be opened.
sub writeFile {
    my ($path, $content) = @_;
    chmod 0666, $path if -e $path;
    open(my $fh, '>', $path) or return 0;
    print {$fh} $content;
    close($fh);
    chmod 0644, $path;
    return 1;
}

##### Main
my $docRoot   = defined $ENV{'DOCUMENT_ROOT'} ? $ENV{'DOCUMENT_ROOT'} : ".";
my $dayFile   = "$docRoot/rssDay.txt";
my $copyFile  = "$docRoot/rssCopy.rss";
my $dayNumber = (localtime)[7];

# Day of year on which the local copy was last refreshed (-1 = never).
my $foundDay = -1;
if (open(my $fh, '<', $dayFile)) {
    my $first = <$fh>;
    close($fh);
    $foundDay = $1 if defined $first && $first =~ /^\s*(\d+)/;
}

$feedContent = "";
if ($dayNumber != $foundDay) {
    my $items = buildFreshItems();
    # An empty result never overwrites the previous good copy.
    if (length $items) {
        $feedContent = $items;
        writeFile($copyFile, $feedContent);
        writeFile($dayFile, $dayNumber);
    }
}

# Already refreshed today, or the refresh failed: use the local copy.
$feedContent = readLocalCopy($copyFile) unless length $feedContent;

##### Generate RSS/XML Output
if ($rssIncludeHeader == 1) {
    print "Content-type: application/rss+xml\n\n";
}
else {
    print "Content-type: text/xml\n\n";
}
print "<?xml version=\"1.0\" encoding=\"utf-8\"?>\r\n";
print "<rss version=\"2.0\">\r\n";
print "<channel>\r\n";
if ($rssIncludeHeader == 1) {
    print "<title>" . $feed_title . "</title>\r\n";
    print "<link>" . $feed_link . "</link>\r\n";
    print "<description>" . $feed_description . "</description>\r\n";
    print "<language>" . $feed_language . "</language>\r\n";
    print "<date>" . $feed_pubdate . "</date>\r\n";
    print "<copyright>" . $feed_copyright . "</copyright>\r\n";
    print "<webmaster>" . $feed_webmaster . "</webmaster>\r\n";
}
print $feedContent;
print "</channel>\r\n";
print "</rss>\r\n";
exit;
|
<?php
/*
 * Facebook Public Group Discussion Board RSS Generator (PHP port).
 * This part holds the settings, the timestamp converter and the crawler,
 * then initialises the variables used by the main loop that follows.
 */

/* Globals (these statements only have an effect when this file is included from inside a function) */
global $target_domain, $target_full_address, $agent_name;
global $rssDescriptionLen, $maxFeedsTotalAccept, $postLink, $rssIncludeHeader;
global $feed_title, $feed_link, $feed_description, $feed_language, $feed_pubdate, $feed_copyright, $feed_webmaster;
global $captured_response, $feedContent;

/* Connection Settings */
$target_domain = "www.facebook.com"; // Site to contact
$target_full_address = "/board.php?uid=12345678901"; // Full path to public group discussion board
// The visitor's user agent is forwarded on every crawl; it may be absent.
$agent_name = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : "";

/* RSS Settings */
$rssDescriptionLen = 22; // Maximum number of characters to allow in the title
$maxFeedsTotalAccept = 6; // Maximum number of links to get
$postLink = "thumbnail"; // "thumbnail" - URL to thumbnail image, "topic" - URL to topic, "post" - URL to post
$rssIncludeHeader = 1; // 1 - include channel information (RSS), 0 - leave channel information out (XML)
$feed_title = "FEED TITLE";
$feed_link = "http://www.facebook.com/PUBLIC-GROUP/";
$feed_description = "FEED DESCRIPTION";
$feed_language = "en-us";
$feed_pubdate = "11 Apr 2001 01:01:00 GMT";
$feed_copyright = "";
$feed_webmaster = "WEBMASTER@YOUREMAIL.COM";
$captured_response = "";
$feedContent = "";

/**
 * Date Filter.
 *
 * Converts text such as "... on January 5, 2010 at 3:45pm" into
 * "05 Jan 2010 15:45:00 GMT".
 * NOTE(review): the scraped time is labelled GMT without conversion -
 * confirm which time zone the source page uses.
 *
 * @param string $rawdate Text containing "on <Month> <day>, <year> at <h>:<mm><am|pm>".
 * @return string The reformatted timestamp, or "" when no date is recognised.
 */
function translateTimeStamp($rawdate) {
    $pattern = '/on\s+([A-Za-z]+)\s+(\d{1,2}),?\s+(\d{4})\s+at\s+(\d{1,2}):(\d{2})\s*(am|pm)?/i';
    if (!preg_match($pattern, (string)$rawdate, $m)) {
        return "";
    }

    $useMonth = ucfirst(strtolower(substr($m[1], 0, 3)));
    $useHour = (int)$m[4];
    $meridiem = isset($m[6]) ? strtolower($m[6]) : "";
    if ($meridiem != "") {
        // 12-hour clock: 12am is hour 0 and 12pm is hour 12.
        $useHour = $useHour % 12;
        if ($meridiem == "pm") {
            $useHour += 12;
        }
    }

    return sprintf("%02d %s %s %02d:%s:00 GMT", (int)$m[2], $useMonth, $m[3], $useHour, $m[5]);
}

/**
 * The Crawler.
 *
 * Downloads http://<tdomain><taddress> and stores the body in the global
 * $captured_response, which is always left as a string ("" on failure).
 *
 * @param string $tdomain  Host name to contact.
 * @param string $taddress Path and query string, starting with "/".
 * @return string "Connection Successful" or "Connection Failed".
 */
function botEngine($tdomain, $taddress) {
    global $captured_response, $agent_name;

    $captured_response = "";
    $targetURL = "http://" . $tdomain . $taddress;

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_USERAGENT, $agent_name);
    curl_setopt($ch, CURLOPT_URL, $targetURL);
    curl_setopt($ch, CURLOPT_FAILONERROR, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_AUTOREFERER, true);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_TIMEOUT, 10);
    $result = curl_exec($ch);
    curl_close($ch); // release the handle whether or not the transfer worked

    // An empty body is treated as a failure, as before.
    if ($result === false || $result === "") {
        return("Connection Failed");
    }

    $captured_response = $result;
    return("Connection Successful");
}

/* Begin */
$postBlockFound = 0;
$topicTitleURL = "";
$topicTitle = "";
$postURL = "";
$postUserName = "";
$postTimeStamp = "";
$postUserThumbnail = "";
$postUserContent = "";
$groupPostData = array();
$groupPostDataCount = 0;
// The statement below is completed by the text that follows this block.
$filename = $_SERVER['DOCUMENT_ROOT'] .
"/rssDay.txt"; $dayNumber = floor((time()-mktime(null,null,null,1,0,date("Y")))/86400); $foundDay = -1; if (file_exists($filename)) { $foundDay = (int)file_get_contents($filename, true); } if ($dayNumber != $foundDay) { /* Grab Content */ $response = botEngine($target_domain, $target_full_address); $captured_response_filtered = preg_replace("/(\r\n)+|(\n|\r)+/", "", $captured_response); $lines = explode(">", $captured_response_filtered); foreach ($lines as $line) { if ($postBlockFound == 8) { $postBlockFound = 0; $tmp = explode("</div", $line); $postTimeStamp = $tmp[0]; // Gather Output $groupPostData[$groupPostDataCount][0] = $topicTitleURL; $groupPostData[$groupPostDataCount][1] = $topicTitle; $groupPostData[$groupPostDataCount][2] = $postURL; $groupPostData[$groupPostDataCount][3] = $postUserName; $groupPostData[$groupPostDataCount][4] = translateTimeStamp($postTimeStamp); $groupPostData[$groupPostDataCount][5] = $postUserThumbnail; $groupPostData[$groupPostDataCount][6] = $postUserContent; $groupPostDataCount = $groupPostDataCount + 1; // Clear $postURL = ""; $topicTitle = ""; $topicTitleURL = ""; $postTimeStamp = ""; $postUserName = ""; } if ($postBlockFound == 7) { if (preg_match("/class=\"topic_pager\"/i", $line)) { $postBlockFound = 8; } } if ($postBlockFound == 6) { $postBlockFound = 7; $tmp = explode("</span", $line); $postUserName = $tmp[0]; } if ($postBlockFound == 5) { if (preg_match("/by <span/i", $line)) { $postBlockFound = 6; } } if ($postBlockFound == 4) { $postBlockFound = 5; $tmp = explode("\"", $line); $postURL = $tmp[1]; $postURL = str_replace("&", "&", $postURL); } if ($postBlockFound == 3) { if (preg_match("/class=\"post_user\"/i", $line)) { $postBlockFound = 4; } } if ($postBlockFound == 2) { $postBlockFound = 3; $tmp = explode("</", $line); $tmp_titleFiltered = ""; $flipTitleIndex = 0; for ($step = 0; $step < strlen($tmp[0]); $step++) { $tmp_topicChar = substr($tmp[0], $step, 1); if ($tmp_topicChar == "<") { $flipTitleIndex = 1; } if 
($flipTitleIndex == 0) { $tmp_titleFiltered = $tmp_titleFiltered . $tmp_topicChar; } if ($tmp_topicChar == ">") { $flipTitleIndex = 0; } } $topicTitle = $tmp_titleFiltered; if (strlen($topicTitle) > $rssDescriptionLen) { $topicTitle = substr($topicTitle, 0, $rssDescriptionLen) . "..."; } } if ($postBlockFound == 1) { $postBlockFound = 2; $tmp = explode("\"", $line); $tmp[1] = str_replace("&", "&", $tmp[1]); $topicTitleURL = $tmp[1]; } if (preg_match("/topic_title_datawrap/i", $line) || preg_match("/topic_title datawrap/i", $line)) { $postBlockFound = 1; } } /* Extract Poster Content */ if ($groupPostDataCount > ($maxFeedsTotalAccept)) { $groupPostDataCount = $maxFeedsTotalAccept; } unset($lines); $captured_response = ""; for ($x = 0; $x < $groupPostDataCount; $x++) { $topicTitleURL = $groupPostData[$x][0]; $topicTitle = $groupPostData[$x][1]; $postURL = $groupPostData[$x][2]; $postUserName = $groupPostData[$x][3]; $postTimeStamp = $groupPostData[$x][4]; $postUserThumbnail = $groupPostData[$x][5]; $postUserContent = $groupPostData[$x][6]; $posterDomain = ""; $posterFullPath = ""; $tmp = explode("//", $postURL); $tmp2 = explode("/", $tmp[1]); $posterDomain = $tmp2[0]; for ($y = 1; $y < count($tmp2); $y++) { $posterFullPath = $posterFullPath . "/" . $tmp2[$y]; } /* Grab Content */ $postBlockFound = 0; $captured_response = ""; $captured_response_filtered = ""; $response = botEngine($posterDomain, $posterFullPath); $captured_response_filtered = preg_replace("/(\r\n)+|(\n|\r)+/", "", $captured_response); $lines = explode(">", $captured_response_filtered); foreach ($lines as $line) { if ($postBlockFound == 6) { $postBlockFound = 0; $tmp_postFiltered = ""; $flipPostIndex = 0; for ($step = 0; $step < strlen($postUserContent); $step++) { $tmp_PostChar = substr($postUserContent, $step, 1); if ($tmp_PostChar == "<") { $flipPostIndex = 1; } if ($flipPostIndex == 0) { $tmp_postFiltered = $tmp_postFiltered . 
$tmp_PostChar; } if ($tmpPostChar == ">") { $flipPostIndex = 0; } } $postUserContent = $tmp_postFiltered; if (strlen($postUserContent) > $rssDescriptionLen) { $postUserContent = substr($postUserContent, 0, $rssDescriptionLen) . "..."; } $groupPostData[$x][5] = $postUserThumbnail; $groupPostData[$x][6] = $postUserContent; // Clear $postURL = ""; $topicTitle = ""; $topicTitleURL = ""; $postTimeStamp = ""; $postUserName = ""; $postUserThumbnail = ""; $postUserContent = ""; } if ($postBlockFound == 5) { if (preg_match("<", $line)) { $line = $line . ">"; } if (preg_match("</div>", $line)) { $postBlockFound = 6; if (strlen($tmpDiscussionData) == 0) { $tmpDiscussionData = $line; } $postUserContent = $tmpDiscussionData; $tmpDiscussionData = ""; } } if ($postBlockFound == 4) { if (preg_match("/" . $postUserName . "</strong/i", $line)) { $postBlockFound = 5; } else { $postBlockFound = 0; } } if ($postBlockFound == 3) { $postBlockFound = 4; } if ($postBlockFound == 2) { if (preg_match("/uiStreamMessagembs/i", $line) || preg_match("/uiStreamMessage mbs/i", $line)) { $postBlockFound = 3; } } if ($postBlockFound == 1) { $postBlockFound = 2; $tmp = explode("src=", $line); $tmp2 = explode(" alt=", $tmp[1]); $postUserThumbnail = $tmp2[0]; $postUserThumbnail = str_replace("\"", "", $postUserThumbnail); } if (preg_match("/UIImageBlock_Image_UIImageBlock_MED_Image/i", $line) || preg_match("/UIImageBlock_Image UIImageBlock_MED_Image/i", $line)) { $postBlockFound = 1; } } } /* Create RSS/XML */ for ($x = 0; $x < $groupPostDataCount; $x++) { $topicTitleURL = $groupPostData[$x][0]; $topicTitle = $groupPostData[$x][1]; $postURL = $groupPostData[$x][2]; $postUserName = $groupPostData[$x][3]; $postTimeStamp = $groupPostData[$x][4]; $postUserThumbnail = $groupPostData[$x][5]; $postUserContent = $groupPostData[$x][6]; $topicTitleURL = str_replace("&", "&", $topicTitleURL); $postURL = str_replace("&", "&", $postURL); if (strlen($postUserContent) == 0) { $postUserContent = $topicTitle; } 
$feedContent = $feedContent . "<item>\r\n"; $feedContent = $feedContent . "<title>" . $postUserContent . "</title>\r\n"; if ($postLink == "thumbnail") { $feedContent = $feedContent . "<link>" . $postUserThumbnail . "</link>\r\n"; } else if ($postLink == "topic") { $feedContent = $feedContent . "<link>" . $topicTitleURL . "</link>\r\n"; } else { $feedContent = $feedContent . "<link>" . $postURL . "</link>\r\n"; } $feedContent = $feedContent . "<description>" . $postUserContent . "</description>\r\n"; $feedContent = $feedContent . "<author>" . $postUserName . "</author>\r\n"; $feedContent = $feedContent . "<date>" . $postTimeStamp . "</date>\r\n"; $feedContent = $feedContent . "</item>\r\n"; } /* Save Generated Data */ $filename = $_SERVER['DOCUMENT_ROOT'] . "/rssCopy.txt"; if (file_exists($filename)) { /*File Found*/ } else { /*No File*/ } $filePointer = fopen($filename, "w"); fputs($filePointer, $feedContent); fclose($filePointer); /* Save Day Number */ $filename = $_SERVER['DOCUMENT_ROOT'] . "/rssDay.txt"; if (file_exists($filename)) { /*File Found*/ } else { /*No File*/ } $filePointer = fopen($filename, "w"); fputs($filePointer, $dayNumber); fclose($filePointer); } else { /* On current day. Use Local RSS Copy. */ $filename = $_SERVER['DOCUMENT_ROOT'] . "/rssCopy.txt"; $feedContent = file_get_contents($filename, true); } /* Generate RSS/XML Output */ if ($rssIncludeHeader == 1) { header("Content-type: application/rss+xml"); } else { header("Content-type: text/xml"); } print "<" . "?" . "xml version=\"1.0\" encoding=\"utf-8\"" . "?" . ">\r\n"; print "<rss version=\"2.0\">\r\n"; print "<channel>\r\n"; if ($rssIncludeHeader == 1) { print "<title>" . $feed_title . "</title>\r\n"; print "<link>" . $feed_link . "</link>\r\n"; print "<description>" . $feed_description . "</description>\r\n"; print "<language>" . $feed_language . "</language>\r\n"; print "<date>" . $feed_pubdate . "</date>\r\n"; print "<copyright>" . $feed_copyright . 
"</copyright>\r\n"; print "<webmaster>" . $feed_webmaster . "</webmaster>\r\n"; } print $feedContent; print "</channel>\r\n"; print "</rss>\r\n"; exit; ?> |