import React, { useState } from "react";
import mammoth from "mammoth";
import { getDocument } from "pdfjs-dist";

const FileUpload = ({text, setText}) => {
  const [loading, setLoading] = useState(false);

  /**
   * Change handler for the file input: reads the first selected file into an
   * ArrayBuffer and routes it to the PDF or Word text extractor.
   *
   * The loading flag is raised while the file is being read and processed and
   * is always lowered again, even if extraction throws.
   *
   * @param {{ target: { files: ArrayLike<File> } }} event - Change event of the file input.
   * @returns {Promise<void>}
   */
  const handleFileChange = async (event) => {
    const file = event.target.files[0];
    if (!file) return;

    const pdfMimeType = "application/pdf";
    const docxMimeType =
      "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    // `File.type` is an empty string when the browser cannot determine the
    // MIME type, so in that case only, fall back to the file extension.
    const lowerCaseName = (file.name || "").toLowerCase();
    const hasNoMimeType = !file.type;
    const isPdf =
      file.type === pdfMimeType ||
      (hasNoMimeType && lowerCaseName.endsWith(".pdf"));
    const isDocx =
      file.type === docxMimeType ||
      (hasNoMimeType && lowerCaseName.endsWith(".docx"));

    // Reject unsupported files before reading them into memory.
    if (!isPdf && !isDocx) {
      alert("Unsupported file type");
      return;
    }

    setLoading(true);
    const fileReader = new FileReader();

    fileReader.onload = async (e) => {
      try {
        const arrayBuffer = e.target.result;
        if (isPdf) {
          await extractTextFromPDF(arrayBuffer);
        } else {
          await extractTextFromWord(arrayBuffer);
        }
      } catch (error) {
        // Nothing awaits this handler, so an escaping error would become an
        // unhandled rejection; log it here instead.
        console.error("Error processing file:", error);
      } finally {
        setLoading(false);
      }
    };

    fileReader.onerror = () => {
      alert("Error reading file");
      setLoading(false);
    };

    fileReader.readAsArrayBuffer(file);
  };
    
  
  /**
   * Normalizes whitespace in extracted text while keeping line breaks.
   *
   * Runs of spaces/tabs collapse to a single space, runs of line breaks
   * (including blank lines and the spaces around them) collapse to a single
   * "\n", and leading/trailing whitespace is removed.
   *
   * @param {string} text - Raw extracted text.
   * @returns {string} The normalized text.
   */
  const cleanText = (text) => {
    return text
      .replace(/\r\n?/g, "\n") // Normalize CRLF / CR line endings to LF
      // `\s` would also match "\n" and erase every line break, so only
      // collapse whitespace that is not a newline here.
      .replace(/[^\S\n]+/g, " ")
      .replace(/ ?\n[ \n]*/g, "\n") // Collapse newline runs and the spaces around them
      .trim(); // Remove leading and trailing whitespace
  };

  /**
   * Extracts the raw text of a .docx document with mammoth, cleans it and
   * stores it through `setText`.
   *
   * The returned promise settles only after extraction has finished, so a
   * caller that awaits it can rely on the text having been set (or the failure
   * having been logged). Extraction errors are logged and not rethrown.
   *
   * @param {ArrayBuffer} arrayBuffer - Contents of the Word file.
   * @returns {Promise<void>}
   */
  const extractTextFromWord = async (arrayBuffer) => {
    try {
      // Await the extraction so this function's promise does not resolve
      // before the text has actually been extracted.
      const result = await mammoth.extractRawText({ arrayBuffer });
      setText(cleanText(result.value));
    } catch (error) {
      console.error("Error extracting text from Word file:", error);
    }
  };

  /**
   * Extracts the text of every page of a PDF with pdf.js, cleans it and
   * stores it through `setText`.
   *
   * Text items of a page are joined with spaces and pages are separated by a
   * newline before cleaning. Extraction errors are logged and not rethrown.
   * The pdf.js loading task is destroyed afterwards so the parsed document
   * does not stay alive after the upload.
   *
   * @param {ArrayBuffer} arrayBuffer - Contents of the PDF file.
   * @returns {Promise<void>}
   */
  const extractTextFromPDF = async (arrayBuffer) => {
    let loadingTask;
    try {
      loadingTask = getDocument({ data: arrayBuffer });
      const pdf = await loadingTask.promise;

      const pageTexts = [];
      // pdf.js page numbers are 1-based.
      for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
        const page = await pdf.getPage(pageNumber);
        const textContent = await page.getTextContent();
        pageTexts.push(textContent.items.map((item) => item.str).join(" "));
      }

      setText(cleanText(pageTexts.join("\n")));
    } catch (error) {
      console.error("Error extracting text from PDF file:", error);
    } finally {
      if (loadingTask) {
        try {
          // Release the document resources held by pdf.js.
          await loadingTask.destroy();
        } catch (error) {
          console.error("Error releasing PDF resources:", error);
        }
      }
    }
  };

  return (
    <div>
      <input type="file" accept=".pdf,.docx" onChange={handleFileChange} />
      {loading && <p>Loading...</p>}
    </div>
  );
};

export default FileUpload;
