Safe parsing of javascript links

This commit is contained in:
Blizzard Finnegan 2024-09-21 18:19:06 -04:00
parent ad3ae9d584
commit b5103853d6
Signed by: blizzardfinnegan
GPG key ID: 61C1E13067E0018E
2 changed files with 55 additions and 46 deletions

View file

@ -1,40 +1,45 @@
# frozen_string_literal: true
require 'mechanize'
# Single Mechanize agent that is handed to every link-checking call below.
agent = Mechanize.new
# Legacy entry point that takes the accumulator first. Kept so call sites
# written against the old argument order keep working; it simply forwards to
# check_for_links.
#
# @param indexed_pages [Array<URI>] links recorded so far (mutated in place)
# @param address [URI] page to fetch
# @param agent [#get] object used to fetch the page
# @return [Array<URI>] indexed_pages
def check_page_for_links(indexed_pages = [], address, agent)
  check_for_links(address, agent, indexed_pages)
end

# Record every link on the page at +address+ that stays on the same host.
#
# @param address [URI] page to fetch; a link is kept only when its host
#   equals address.host
# @param agent [#get] object used to fetch the page (a Mechanize agent in
#   this script)
# @param indexed_pages [Array<URI>] links recorded so far; new links are
#   pushed onto this same array
# @return [Array<URI>] indexed_pages, on every path (successful page,
#   non-page response, HTTP error) so callers can always treat the result as
#   an Array
def check_for_links(address, agent, indexed_pages = [])
  page = agent.get(address)
  # Anything that is not a Mechanize::Page is skipped, as in the original guard.
  return indexed_pages unless page.is_a?(Mechanize::Page)

  page.links.each do |link|
    # Skip every javascript: pseudo-link (bare "javascript:" as well as forms
    # such as "javascript:void(0)") before trying to resolve it.
    next if link.href.to_s.strip.downcase.start_with?('javascript:')

    # Get a complete resolved URI for the link
    uri = link.resolved_uri
    # Remove the fragment so there's less overlap
    uri.fragment = nil
    # Clear query (TODO: Stash away later)
    uri.query = nil
    # Keep the link only if it stays on this host and has not been seen before.
    indexed_pages.push(uri) if uri.host == address.host && !indexed_pages.include?(uri)
  end
  indexed_pages
rescue Mechanize::ResponseCodeError => e
  # Report the HTTP status, then still hand back the list gathered so far
  # instead of letting the value of `p` become the return value.
  p e.response_code
  indexed_pages
end
# Crawl outward from the seed URL until no unvisited address remains.
addresses = check_for_links(URI.parse('http://git.blizzard.systems'), agent)
# Guard against a non-Array result (e.g. nil) before doing set arithmetic.
addresses = [] unless addresses.is_a?(Array)
# Addresses that have already been fetched.
old_addresses = []
until (pending = addresses - old_addresses).empty?
  # Walk the snapshot of unvisited addresses, not the list being extended.
  pending.each do |address|
    appending_addrs = check_for_links(address, agent, addresses)
    # Array#union returns a new Array rather than mutating its receiver, so
    # the merged result has to be assigned back to take effect.
    addresses = addresses.union(appending_addrs) if appending_addrs.is_a?(Array)
  end
  # Everything in pending has now been fetched once; do not fetch it again.
  old_addresses = old_addresses.union(pending)
end
# NOTE(review): these statements look like the pre-change driver (they call
# check_page_for_links and end in semicolons) followed by a second, newer
# copy of the three print lines — confirm which copy is live before editing.
# Seed the list with the links found on the start page.
addresses = check_page_for_links([],URI::parse('http://git.blizzard.systems'),agent);
# Copy of the address list taken at the start of the previous pass.
old_addresses = Array.new();
#p addresses;
# Keep making passes until a pass leaves no address missing from the copy.
until addresses - old_addresses == [] do
old_addresses = Array.new(addresses);
addresses.each do | address |
# addresses itself is passed as the accumulator argument.
appending_addrs = check_page_for_links(addresses,address,agent);
if !appending_addrs.nil?
# NOTE(review): Array#union returns a new array and the result is discarded
# here, so this statement does not change addresses by itself.
addresses.union(appending_addrs);
end
end
end
# Print the final address list, a separator line, and the last copy.
p addresses;
puts "********************************************";
p old_addresses;
p addresses
puts '********************************************'
p old_addresses

View file

@ -23,22 +23,26 @@ end
# Record every link on the page at +address+ that stays on the same host.
#
# @param address [URI] page to fetch; a link is kept only when its host
#   equals address.host
# @param agent [#get] object used to fetch the page
# @param indexed_pages [Array<URI>] links recorded so far; new links are
#   pushed onto this same array
# @return [Array<URI>] indexed_pages, on every path (successful page,
#   non-page response, HTTP error) so callers can always treat the result as
#   an Array
def check_for_links(address, agent, indexed_pages = [])
  page = agent.get(address)
  # Anything that is not a Mechanize::Page is skipped, as in the original guard.
  return indexed_pages unless page.is_a?(Mechanize::Page)

  page.links.each do |link|
    # Skip every javascript: pseudo-link (bare "javascript:" as well as forms
    # such as "javascript:void(0)") before trying to resolve it.
    next if link.href.to_s.strip.downcase.start_with?('javascript:')

    # Get a complete resolved URI for the link
    uri = link.resolved_uri
    # Remove the fragment so there's less overlap
    uri.fragment = nil
    # Clear query (TODO: Stash away later)
    uri.query = nil
    # Keep the link only if it stays on this host and has not been seen before.
    indexed_pages.push(uri) if uri.host == address.host && !indexed_pages.include?(uri)
  end
  indexed_pages
rescue Mechanize::ResponseCodeError => e
  # Report the HTTP status, then still hand back the list gathered so far
  # instead of letting the value of `p` become the return value.
  p e.response_code
  indexed_pages
end