Safe parsing of javascript links
This commit is contained in:
parent
ad3ae9d584
commit
b5103853d6
2 changed files with 55 additions and 46 deletions
69
google.rb
69
google.rb
|
@ -1,40 +1,45 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
require 'mechanize'
|
||||
agent = Mechanize.new
|
||||
|
||||
# Collect the same-host links found on the page at +address+.
#
# Each link is resolved to an absolute URI and stripped of its fragment and
# query before it is compared, so variants of one page are indexed once.
#
# @param address [URI] page to fetch; only links sharing its host are kept
# @param agent [Mechanize] agent used to fetch the page
# @param indexed_pages [Array<URI>] links collected so far; appended to in place
# @return [Array<URI>] +indexed_pages+ on every path, including a non-page
#   response and an HTTP error
def check_for_links(address, agent, indexed_pages = [])
  page = agent.get(address)
  # Anything that is not a parsed Mechanize::Page is skipped, but the
  # accumulator is still returned so callers never receive nil
  return indexed_pages unless page.is_a?(Mechanize::Page)

  page.links.each do |link|
    # Skip every javascript: pseudo-link, not only the bare 'javascript:' href
    next if link.href.to_s.strip.downcase.start_with?('javascript:')

    # Get a complete resolved URI for the link
    uri = link.resolved_uri
    # Remove the fragment so there's less overlap
    uri.fragment = nil
    # Clear query (TODO: Stash away later)
    uri.query = nil
    # Keep the link only if it stays on this host and has not been seen before
    next unless uri.host == address.host
    next if indexed_pages.include?(uri)

    indexed_pages.push(uri)
  end
  indexed_pages
rescue Mechanize::ResponseCodeError => e
  # Report the HTTP status. `p` returns its argument, so the accumulator is
  # returned explicitly instead of leaking the status code to the caller.
  p e.response_code
  indexed_pages
end

# Legacy entry point with the original argument order; delegates to
# check_for_links so both names share one implementation.
#
# @param indexed_pages [Array<URI>] links collected so far; appended to in place
# @param address [URI] page to fetch
# @param agent [Mechanize] agent used to fetch the page
# @return [Array<URI>] +indexed_pages+
def check_page_for_links(indexed_pages = [], address, agent)
  check_for_links(address, agent, indexed_pages)
end
|
||||
|
||||
# Crawl outward from the root page until no unseen same-host link remains,
# then print the collected addresses.
root = URI.parse('http://git.blizzard.systems')
addresses = check_for_links(root, agent)
# A failed first fetch can hand back nil or a status code instead of an array
addresses = [] unless addresses.is_a?(Array)

# Walk the list with a cursor because it grows while it is being processed;
# each address is fetched exactly once.
cursor = 0
while cursor < addresses.length
  found = check_for_links(addresses[cursor], agent, addresses)
  # Array#union returns a new array, so a separately returned list has to be
  # merged by assignment; in-place additions are already present.
  addresses |= found if found.is_a?(Array) && !found.equal?(addresses)
  cursor += 1
end
# Snapshot kept so the final report has the same shape as before
old_addresses = Array.new(addresses)

p addresses
puts '********************************************'
p old_addresses
|
||||
|
|
|
@ -23,22 +23,26 @@ end
|
|||
# Collect the same-host links found on the page at +address+.
#
# Each link is resolved to an absolute URI and stripped of its fragment and
# query before it is compared, so variants of one page are indexed once.
#
# @param address [URI] page to fetch; only links sharing its host are kept
# @param agent [Mechanize] agent used to fetch the page
# @param indexed_pages [Array<URI>] links collected so far; appended to in place
# @return [Array<URI>] +indexed_pages+ on every path, including a non-page
#   response and an HTTP error
def check_for_links(address, agent, indexed_pages = [])
  # Check all links on the page
  page = agent.get(address)
  # Anything that is not a parsed Mechanize::Page is skipped, but the
  # accumulator is still returned so callers never receive nil
  return indexed_pages unless page.is_a?(Mechanize::Page)

  page.links.each do |link|
    # Skip every javascript: pseudo-link, not only the bare 'javascript:' href
    next if link.href.to_s.strip.downcase.start_with?('javascript:')

    # Get a complete resolved URI for the link
    uri = link.resolved_uri
    # Remove the fragment so there's less overlap
    uri.fragment = nil
    # Clear query (TODO: Stash away later)
    uri.query = nil
    # Check if we're leaving the host
    next unless uri.host == address.host
    # Check if we've seen the link before
    next if indexed_pages.include?(uri)

    # Save link to array
    indexed_pages.push(uri)
  end
  # Return all collected links
  indexed_pages
rescue Mechanize::ResponseCodeError => e
  # Report the HTTP status. `p` returns its argument, so the accumulator is
  # returned explicitly instead of leaking the status code to the caller.
  p e.response_code
  indexed_pages
end
|
||||
|
|
Loading…
Add table
Reference in a new issue