Skip to content Skip to sidebar Skip to footer

Jsoup : How To Parse Multiple Html Files From Local Drive?

I have multiple HTML files on my hard drive that I want to parse with Jsoup. I've been able to parse a single file, but not multiple files. I would like to parse all the files in a folder. I wrote this code…

Solution 1:

Extract the HTML-parsing code into a method, then list the contents of your directory and call `parse` for each file:

// List the entries of the folder and hand every regular file to parse().
File input = new File("C:/html");
File[] st = input.listFiles();
// listFiles() returns null when the path is not a directory or cannot be read.
if (st != null) {
    for (int i = 0; i < st.length; i++) {
        if (st[i].isFile()) { // add other conditions here, e.g. name ends in ".html"
            parse(st[i]);
        }
    }
}

So your code should look like this:

import java.io.File;
import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

publicclassMain {

    publicstaticvoidmain(String[] args) {
        Fileinput=newFile("C:/html");
        File[] st = input.listFiles();
        for (inti=0; i < st.length; i++) {
            if(st[i].isFile()){//other condition like name ends in html
                parse(st[i]);
            }
        }

    }

    privatestaticvoidparse(File input ) {
        Document doc;

        try{

            doc = Jsoup.parse(input, "UTF-8", "");


            Elementsids= doc.select("div[id^=desk] p");

            for (Element id : ids){

                System.out.println("\n"+id.text());

            }

        }catch(IOException e){

        }
    }
}

Solution 2:

I have written a program that reads a folder and its subfolders for a given path and writes the results to a CSV file:

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

publicclassfixingCode {

    publicstaticvoidmain(String[] args) {
        FileWriterwriter=null;

        System.out.println("--------------------------Program started--------------------------");

        Fileinput=newFile(
                "C:\\My Web Sites\\\\library\\math");//reading file from parent folder try {
            writer = newFileWriter("c:\\Temp\\results.csv");//writing file on path
            Process(input, writer);

        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } finally {

            try {

                writer.flush();
                writer.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        //

        System.out.println("--------------------------Program End--------------------------");
    }

    staticintspc_count= -1;

    staticvoidProcess(File aFile, FileWriter writer) {
        spc_count++;
        Documentdoc=null;
        Stringspcs="";

        try {

            //for (inti=0; i < spc_count; i++)
                spcs += " ";
            if (aFile.isFile()) {
                System.out.println(spcs + "[FILE] " + aFile.getName());
            } elseif (aFile.isDirectory()) {
                //
                System.out.println(spcs + "[DIR] " + aFile.getName());
                //
                File[] listOfFiles = aFile.listFiles();
                //
                File[] st = listOfFiles;

                //for (inti=0; i < st.length; i++) {
                    if (st[i].isFile()) {// other condition like name// ends in

                        doc = Jsoup.parse(st[i], null);

                        // get page titleStringtitle= doc.title();
                        System.out.println("title : " + "[" + i + "]" + title);
                        //StringownText= doc.body().ownText();
                        Stringtext= doc.body().text();
                        //// System.out.println("ownText" + ownText + "\n");
                        System.out.println("text" + text);
                        //

                        writer.append("title : " + "[" + i + "]");
                        writer.append(',');
                        writer.append(title);
                        writer.append('\n');

                        /*
                         * writer.append("ownText"); writer.append(',');
                         * writer.append(ownText); writer.append('\n');
                         */

                        writer.append("text : " + "[" + i + "]");
                        writer.append(',');
                        writer.append(text);
                        writer.append('\n');
                    }
                    ////if (listOfFiles != null) {
                        //for (intj=0; j < listOfFiles.length; j++)
                            Process(listOfFiles[j], writer);
                    } else {
                        System.out.println(spcs + " [ACCESS DENIED]");
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }

        spc_count--;
    }

}

Post a Comment for "Jsoup : How To Parse Multiple Html Files From Local Drive?"