html->array

Colin Holgate colinholgate at gmail.com
Sat Nov 14 11:07:39 EST 2015


When I wrote something to do page scraping I used mostly string tricks to reduce the source to a list of what I wanted. Below is the whole script, and the different handlers let me get the text, images, videos, or links from a page:


global gPageURL

-- Reduces HTML source to its plain text.
-- pPageSource: the HTML source of a page.
-- Returns the source with markup removed, line feeds deleted and tabs turned into spaces.
function getText pPageSource
   -- One pattern, three alternatives: whole <script>/<style> elements (contents included),
   -- HTML comments, and finally any remaining tag.
   put "(?:<(?P<tag>script|style)[\s\S]*?</(?P=tag)>)|(?:<!--[\s\S]*?-->)|(?:<[\s\S]*?>)" into tMarkupPattern
   put replaceText(pPageSource,tMarkupPattern,"") into tPlainText
   -- Line feeds are removed outright (not replaced by a space); tabs become single spaces.
   replace lf with "" in tPlainText
   replace tab with " " in tPlainText
   return tPlainText
end getText

-- Extracts the URLs of media files (images, video, audio) referenced in a page.
-- pPageSource: the HTML source of a page.
-- Returns one URL per line, each passed through removeLeaders together with the
-- global gPageURL.
function getMedia pPageSource
   -- Every extension carries its leading dot so that only file-name suffixes are matched
   -- (a bare "m4v" would also match that text anywhere else in the page).
   put ".jpg,.png,.gif,.jpeg,.mov,.mp4,.m4v,.mp3" into tExtensions
   -- Mark the end of each candidate URL: append "*" and break the line right after
   -- the extension, so a media URL always ends a line with the "*" marker.
   repeat for each item tExtension in tExtensions
      replace tExtension with tExtension & "*" & return in pPageSource
   end repeat
   -- Walk backwards so that deleting a line does not shift the lines still to be visited.
   repeat with tLineNumber = the number of lines in pPageSource down to 1
      put line tLineNumber of pPageSource into tLine
      if the last char of tLine is "*" then
         -- Drop the marker, then strip everything that precedes the URL on this line.
         delete the last char of tLine
         put removeLeaders(gPageURL,tLine) into line tLineNumber of pPageSource
      else
         -- No marker: this line does not end in a media file name.
         delete line tLineNumber of pPageSource
      end if
   end repeat
   return pPageSource
end getMedia

-- Strips everything that precedes a URL on a line of page source.
-- pPageURL: URL of the page the line came from.
-- pLinkURL: text that ends with the URL of interest.
-- Returns the result of getPath for the page URL and the trimmed link.
function removeLeaders pPageURL,pLinkURL
   -- Characters that can sit immediately before a URL: double quote, single quote, parentheses.
   put quote&"'()" into tLeadChars
   -- For each of them in turn, keep only what follows its last occurrence.
   repeat for each char tLeadChar in tLeadChars
      set the itemdelimiter to tLeadChar
      put the last item of pLinkURL into pLinkURL
   end repeat
   return getPath(pPageURL,pLinkURL)
end removeLeaders

-- Collects the distinct link targets found in a page.
-- pPageSource: the HTML source of a page.
-- Returns a sorted list, one URL per line, with adjacent duplicates removed.
function getLinks pPageSource
   -- Put every anchor element on a line of its own.
   replace "/a>" with "/a>" & return in pPageSource
   replace "<a" with return & "<a" in pPageSource
   -- Keep only the lines that hold an anchor with a quoted value starting with "http".
   filter pPageSource with "*a href*" & quote & "http*/a>"
   -- With quote as the delimiter, item 2 is the first quoted value on the line.
   set the itemdelimiter to quote
   put the number of lines in pPageSource into tLineCount
   repeat with tIndex = 1 to tLineCount
      put item 2 of line tIndex of pPageSource into tTarget
      put getPath(gPageURL,tTarget) into line tIndex of pPageSource
   end repeat
   -- Sorting brings equal URLs together so one comparison with the previously kept line is enough.
   sort pPageSource
   put line 1 of pPageSource into tLinks
   put the number of lines in pPageSource into tLineCount
   repeat with tIndex = 2 to tLineCount
      put line tIndex of pPageSource into tCandidate
      if tCandidate <> the last line of tLinks then
         put return & tCandidate after tLinks
      end if
   end repeat
   return tLinks
end getLinks

-- Resolves a link found in a page into an absolute URL.
-- pPageURL: absolute URL of the page, e.g. "http://host/dir/page.html".
-- pLinkURL: the link as written in the page source.
-- Returns pLinkURL unchanged when it already contains "://"; otherwise the link
-- resolved against pPageURL.
function getPath pPageURL,pLinkURL
   -- Already absolute: nothing to resolve.
   if pLinkURL contains "://" then
      return pLinkURL
   end if
   set the itemdelimiter to "/"
   -- Protocol-relative link ("//host/path"): only the scheme comes from the page.
   -- Item 1 of the page URL is the scheme including its colon, e.g. "http:".
   if char 1 to 2 of pLinkURL is "//" then
      return item 1 of pPageURL & pLinkURL
   end if
   -- Root-relative link: items 1 to 3 of the page URL are scheme, empty item and host.
   if char 1 of pLinkURL is "/" then
      return item 1 to 3 of pPageURL & pLinkURL
   end if
   -- Reduce the page URL to its directory, without a trailing slash.
   if the last char of pPageURL is "/" then
      -- The page URL already names a directory; only the slash has to go.
      delete the last char of pPageURL
   else if the number of items in pPageURL > 3 then
      -- Drop the file name; the guard keeps the host when the URL has no path at all.
      delete the last item of pPageURL
   end if
   -- Each leading "../" climbs one directory, never above the host.
   repeat while char 1 to 3 of pLinkURL is "../"
      if the number of items in pPageURL > 3 then
         delete the last item of pPageURL
      end if
      delete char 1 to 3 of pLinkURL
   end repeat
   return pPageURL & "/" & pLinkURL
end getPath

-- Shows a media file, choosing between an image object and the player by file extension.
-- pMediaFile: path or URL of the media file; the text after its last "." selects the branch.
on showMedia pMediaFile
   -- Remove the image left over from an earlier call, if there is one.
   if there is an image "mediaImage" then
      delete image "mediaImage"
   end if
   -- With "." as the delimiter, the last item is the file extension.
   set the itemdelimiter to "."
   put the last item of pMediaFile into tExtension
   switch tExtension
      case "png"
      case "gif"
      case "jpg"
      case "jpeg"
         -- Still image: make a new image object, name it so the next call can find and
         -- delete it, then point it at the file.
         new image
         set the name of image the number of images to "mediaImage"
         set the filename of image "mediaImage" to pMediaFile
         break
      case "mp4"
      case "m4v"
      case "mov"
      case "mp3"
         -- Video or audio: play it with the controller switched on.
         set the showController of the templatePlayer to true
         play video pMediaFile
         break
   end switch
end showMedia





More information about the use-livecode mailing list