Exporting Usage Manual: Difference between revisions

From GNU Radio
Jump to navigation Jump to search
No edit summary
No edit summary
Line 12: Line 12:
     import HTMLParser  
     import HTMLParser  
     import os
     import os
 
   
     categories = ['Usage Manual', 'Block Docs'] # Categories which to export, should match the names in wiki
     categories = ['Usage Manual', 'Block Docs'] # Categories which to export, should match the names in wiki
     directory_names = ['Usage Manual', 'Block Docs'] # names of directories that are created for each category, and prefix to file names
     directory_names = ['Usage Manual', 'Block Docs'] # names of directories that are created for each category, and prefix to file names
 
   
     # set up web driver
     # set up web driver
     driver = webdriver.Firefox(executable_path='/home/marc/Downloads/geckodriver')
     driver = webdriver.Firefox(executable_path='/home/marc/Downloads/geckodriver')
       
       
     for i in range(len(categories)):
     for i in range(len(categories)):
         # make directory if it doesn't exist
         # make directory if it doesn't exist
         if not os.path.exists(directory_names[i]): # will be in the same location as this script
         if not os.path.exists(directory_names[i]): # will be in the same location as this script
             os.makedirs(directory_names[i])
             os.makedirs(directory_names[i])
 
   
         # Go to the wiki's export page
         # Go to the wiki's export page
         driver.get("https://wiki.gnuradio.org/index.php/Special:Export")
         driver.get("https://wiki.gnuradio.org/index.php/Special:Export")
 
   
         # fill in text box
         # fill in text box
         text_area = driver.find_element_by_xpath("//*[@name='catname']")
         text_area = driver.find_element_by_xpath("//*[@name='catname']")
         text_area.send_keys(categories[i])
         text_area.send_keys(categories[i])
 
   
         # Hit Add button
         # Hit Add button
         submit_button = driver.find_element_by_xpath("//*[@value='Add']")
         submit_button = driver.find_element_by_xpath("//*[@value='Add']")
         submit_button.click()  
         submit_button.click()  
 
   
         # uncheck "save as file" box
         # uncheck "save as file" box
         check_box = driver.find_element_by_xpath("//*[@name='wpDownload']")
         check_box = driver.find_element_by_xpath("//*[@name='wpDownload']")
         check_box.click()
         check_box.click()
 
   
         # hit Export
         # hit Export
         submit_button = driver.find_element_by_xpath("//*[@value='Export']")
         submit_button = driver.find_element_by_xpath("//*[@value='Export']")
         submit_button.click()  
         submit_button.click()  
 
   
         # get HTML of new page
         # get HTML of new page
         raw_html = driver.page_source
         raw_html = driver.page_source
         start_index = raw_html.find('<page>')
         start_index = raw_html.find('<page>')
         cropped_html = raw_html[start_index:]
         cropped_html = raw_html[start_index:]
 
   
         while True:
         while True:
             indx1 = cropped_html.find('<title>')
             indx1 = cropped_html.find('<title>')
             indx2 = cropped_html.find('</title>')
             indx2 = cropped_html.find('</title>')
           
           
             # check if we are done
             # check if we are done
             if (indx2 - indx1) == 0: # happens when it doesnt find a <title>
             if (indx2 - indx1) == 0: # happens when it doesnt find a <title>
                 break
                 break
               
               
             title = cropped_html[indx1 + 7 : indx2]
             title = cropped_html[indx1 + 7 : indx2]
             indx3 = cropped_html.find('[[Category:' + categories[i] + ']]') # using this instead of <text> because <text> has different numbers each time
             indx3 = cropped_html.find('[[Category:' + categories[i] + ']]') # using this instead of <text> because <text> has different numbers each time
             indx4 = cropped_html.find('</text>')
             indx4 = cropped_html.find('</text>')
             body = cropped_html[indx3 + len(categories[i]) + 13 : indx4] #  
             body = cropped_html[indx3 + len(categories[i]) + 13 : indx4] #  
           
           
             # save body to file
             # save body to file
             h = HTMLParser.HTMLParser()
             h = HTMLParser.HTMLParser()
Line 72: Line 72:
             # remove the page we just saved
             # remove the page we just saved
             cropped_html = cropped_html[indx2 + 8:] # doesnt really matter how much you add here, just needs to move on to the next <title>
             cropped_html = cropped_html[indx2 + 8:] # doesnt really matter how much you add here, just needs to move on to the next <title>
 
   
     driver.quit() # closes window
     driver.quit() # closes window

Revision as of 03:28, 13 March 2019

One of the issues people raised with moving the documentation that is not specific to a block, function, or class from Doxygen to this wiki is that people won't have a local copy of the Usage Manual anymore. To solve this, I put together a little script that exports a list of pages on this wiki as raw wiki source, which looks almost exactly the same as the Doxygen source. This script only has to be run a few times a year, and the resulting files are then committed to the gnuradio git repository (the location of these files within the repo has yet to be determined).

To use this script, you will have to download geckodriver and change the line that points to where it is located. After running it, there should be one new directory per exported category ("Usage Manual" and "Block Docs") in whatever directory you ran the script from, each containing a bunch of text files.

  1. Download latest version of geckodriver from here and extract it to a known location, then modify the line "driver = webdriver.Firefox(..." below (I've had the best luck with v0.21)
  2. sudo pip install selenium HTMLParser
  3. Make sure the two versions match, see this table
   from selenium import webdriver
   from selenium.webdriver.common.keys import Keys
   import time
   import HTMLParser 
   import os
    
   # Export every page of the listed wiki categories to plain-text files.
   #
   # For each category the script drives Firefox through the wiki's
   # Special:Export form, reads the resulting XML from the browser, and writes
   # one "<directory>/<directory>- <title>.txt" file per exported page.
   categories = ['Usage Manual', 'Block Docs'] # Categories which to export, should match the names in wiki
   directory_names = ['Usage Manual', 'Block Docs'] # names of directories that are created for each category, and prefix to file names

   # set up web driver
   driver = webdriver.Firefox(executable_path='/home/marc/Downloads/geckodriver')

   # One parser instance is enough; it is only used for entity unescaping.
   h = HTMLParser.HTMLParser()

   try:
       for category, directory_name in zip(categories, directory_names):
           # make directory if it doesn't exist
           if not os.path.exists(directory_name): # will be in the same location as this script
               os.makedirs(directory_name)

           # Go to the wiki's export page
           driver.get("https://wiki.gnuradio.org/index.php/Special:Export")

           # fill in text box
           text_area = driver.find_element_by_xpath("//*[@name='catname']")
           text_area.send_keys(category)

           # Hit Add button
           submit_button = driver.find_element_by_xpath("//*[@value='Add']")
           submit_button.click()

           # uncheck "save as file" box
           check_box = driver.find_element_by_xpath("//*[@name='wpDownload']")
           check_box.click()

           # hit Export
           submit_button = driver.find_element_by_xpath("//*[@value='Export']")
           submit_button.click()

           # get HTML of new page
           raw_html = driver.page_source
           category_marker = '[[Category:' + category + ']]'

           # Walk the export one page at a time. `cursor` always points at or
           # before the next unread <title>, and every search below starts
           # from the current page's own tags so a page can never pick up the
           # body of the page before it.
           cursor = raw_html.find('<page>')
           while cursor != -1:
               title_open = raw_html.find('<title>', cursor)
               if title_open == -1:
                   break # no more pages
               title_close = raw_html.find('</title>', title_open)
               if title_close == -1:
                   break # truncated export, nothing more we can parse
               title = raw_html[title_open + len('<title>') : title_close]

               text_close = raw_html.find('</text>', title_close)
               if text_close == -1:
                   break # page without a closed <text> element

               # The body normally starts right after the category marker
               # (using this instead of <text> because <text> has different
               # attributes each time). Only look inside this page's text.
               marker_pos = raw_html.find(category_marker, title_close, text_close)
               if marker_pos != -1:
                   body_start = marker_pos + len(category_marker)
               else:
                   # Marker missing or written differently: fall back to the
                   # start of the <text ...> element so the page is still saved.
                   text_open = raw_html.find('<text', title_close, text_close)
                   tag_end = -1
                   if text_open != -1:
                       tag_end = raw_html.find('>', text_open, text_close)
                   body_start = tag_end + 1 if tag_end != -1 else text_close
               body = raw_html[body_start : text_close]

               # save body to file
               body_text = h.unescape(body) # makes it so stuff like &gt shows up as a greater than sign
               # Subpage titles contain '/', which would otherwise be read as
               # a directory separator and make open() fail.
               file_name = directory_name + "- " + title.replace('/', '_') + ".txt"
               with open(os.path.join(directory_name, file_name), "w") as text_file:
                   text_file.write('===' + title + '===\n')
                   text_file.write(body_text)

               # move on to the next page, past everything we just consumed
               cursor = text_close + len('</text>')
   finally:
       driver.quit() # closes window, even if an export step failed