Skip to content

corpus_analysis_report¤

Module Description¤

The corpus_analysis_report module provides a single helper function for generating a comprehensive analysis report of a corpus's contents.

create_corpus_analysis_report(corpus: Corpus, output_dir: str = None, console_output=True, html=False) -> str ¤

Create a comprehensive report of the corpus analysis results.

This function exports various statistics and summaries of the corpus analysis to CSV files and generates a text report.

Parameters:

Name Type Description Default
corpus Corpus

The corpus object containing documents and metadata.

required
output_dir str

The directory path where the report files will be saved.

None
console_output bool

Whether to print progress messages to the console.

True
html bool

Whether to produce the report as HTML. When False, the HTML report is converted to Markdown.

False

Returns:

Name Type Description
str str

The generated report as a string in either HTML or Markdown format.

Source code in lexos/corpus/corpus_analysis_report.py
def _build_html_report(corpus, stats, quality, zipf) -> str:
    """Assemble the HTML version of the corpus analysis report.

    Args:
        corpus: The corpus being reported on (its name, document/token counts
            and ``analysis_results`` are read).
        stats: The statistics object obtained from ``corpus.get_stats()``.
        quality: The value of ``stats.corpus_quality_metrics``.
        zipf: The value of ``stats.zipf_analysis``.

    Returns:
        str: A complete HTML document as a single string.
    """
    # Imported locally because the public entry point has a parameter named
    # ``html`` and the block must not rely on a new module-level import.
    from html import escape

    def text(value) -> str:
        # Interpolated values (corpus name, module names, ...) are arbitrary
        # strings; escape them so "<" or "&" cannot break the markup.
        return escape(str(value), quote=False)

    token_counts = stats.doc_stats_df["total_tokens"]
    length_balance = quality["document_length_balance"]["classification"]
    sampling_adequacy = quality["vocabulary_richness"]["sampling_adequacy"]
    size_adequacy = quality["corpus_size_metrics"]["size_adequacy"]

    parts = [
        "<html><head><title>Corpus Analysis Report</title></head><body>",
        "<h1>Corpus Analysis Report</h1>",
        f"<p>Generated: {time.strftime('%Y-%m-%d %H:%M:%S')}</p>",
        "<hr>",
        "<h2>Corpus Overview</h2><ul>",
        f"<li>Name: {text(corpus.name)}</li>",
        f"<li>Total Documents: {corpus.num_docs}</li>",
        f"<li>Active Documents: {corpus.num_active_docs}</li>",
        f"<li>Total Tokens: {corpus.num_tokens}</li>",
        "</ul>",
        "<h2>Statistical Summary</h2><ul>",
        f"<li>Mean Length: {stats.mean:.1f} tokens</li>",
        f"<li>Standard Deviation: {stats.standard_deviation:.1f} tokens</li>",
        f"<li>Shortest Document: {token_counts.min()} tokens</li>",
        f"<li>Longest Document: {token_counts.max()} tokens</li>",
        f"<li>IQR Outliers: {len(stats.iqr_outliers)}</li>",
        "</ul>",
        "<h2>Quality Assessment</h2><ul>",
        f"<li>Length Balance: {text(length_balance)}</li>",
        f"<li>Sampling Adequacy: {text(sampling_adequacy)}</li>",
        f"<li>Size Adequacy: {text(size_adequacy)}</li>",
        f"<li>Follows Zipf's Law: {zipf['follows_zipf']}</li>",
        "</ul>",
    ]

    if corpus.analysis_results:
        parts.append("<h2>Module Analyses</h2><ul>")
        for module_name, data in corpus.analysis_results.items():
            parts.append(
                f"<li>{text(module_name)}: Version {text(data['version'])} "
                f"({text(data['timestamp'])})</li>"
            )
        parts.append("</ul>")

    parts.append("</body></html>")
    return "".join(parts)


def create_corpus_analysis_report(
    corpus: Corpus,
    output_dir: str = None,
    console_output: bool = True,
    html: bool = False,
) -> str:
    """Create a comprehensive report of the corpus analysis results.

    The report is always built as HTML first and, unless ``html`` is True,
    converted to Markdown before being returned. When ``output_dir`` is given,
    the following files are also written into it (the directory and any
    missing parents are created):

    - ``corpus_overview.csv``: the corpus data frame from ``corpus.to_df()``.
    - ``document_statistics.csv``: ``stats.doc_stats_df``.
    - ``corpus_summary.csv``: a one-row summary of corpus-level metrics.
    - ``module_analysis_results.json``: ``corpus.analysis_results`` (only when
      it is non-empty).
    - ``analysis_report.html`` or ``analysis_report.md``: the report itself.

    Args:
        corpus (Corpus): The corpus object containing documents and metadata.
        output_dir (str): The directory path where the report files will be
            saved. If None, nothing is written to disk.
        console_output (bool): Whether to print progress messages to the
            console. Only relevant when ``output_dir`` is provided.
        html (bool): Whether to return (and save) the report as HTML instead
            of Markdown.

    Returns:
        str: The generated report as a string in either HTML or Markdown format.
    """
    # Fetch the stats object and its derived metrics once; they are used both
    # for the CSV summary and for the report body.
    stats = corpus.get_stats(active_only=True)
    quality = stats.corpus_quality_metrics
    zipf = stats.zipf_analysis

    # Only create output directory and save files if output_dir is provided
    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents=True so a nested, not-yet-existing path does not fail.
        output_dir.mkdir(parents=True, exist_ok=True)

        # 1. Export corpus data to CSV
        corpus_df = corpus.to_df()
        csv_path = output_dir / "corpus_overview.csv"
        corpus_df.to_csv(csv_path, index=False)
        if console_output:
            msg.good(f"✓ Exported corpus overview to {csv_path}")
            msg.info(f"   - {len(corpus_df)} documents")
            msg.info(f"   - {len(corpus_df.columns)} data columns")

        # 2. Export detailed statistics
        stats_path = output_dir / "document_statistics.csv"
        stats.doc_stats_df.to_csv(stats_path)
        if console_output:
            msg.good(f"✓ Exported detailed statistics to {stats_path}")

        # 3. Export analysis results summary (key order defines CSV columns)
        summary_data = {
            "corpus_name": corpus.name,
            "total_documents": corpus.num_docs,
            "active_documents": corpus.num_active_docs,
            "total_tokens": corpus.num_tokens,
            "mean_document_length": stats.mean,
            "std_document_length": stats.standard_deviation,
            "iqr_outliers_count": len(stats.iqr_outliers),
            "corpus_fingerprint": corpus._generate_corpus_fingerprint(),
            # Quality metrics
            "length_balance_classification": quality["document_length_balance"][
                "classification"
            ],
            "sampling_adequacy": quality["vocabulary_richness"][
                "sampling_adequacy"
            ],
            "size_adequacy": quality["corpus_size_metrics"]["size_adequacy"],
            # Zipf analysis
            "zipf_follows_law": zipf["follows_zipf"],
            "zipf_goodness_of_fit": zipf["zipf_goodness_of_fit"],
            "zipf_r_squared": zipf["r_squared"],
        }

        summary_path = output_dir / "corpus_summary.csv"
        pd.DataFrame([summary_data]).to_csv(summary_path, index=False)
        if console_output:
            msg.good(f"✓ Exported corpus summary to {summary_path}")

        # 4. Export analysis results from modules
        if corpus.analysis_results:
            results_path = output_dir / "module_analysis_results.json"
            with open(results_path, "w", encoding="utf-8") as f:
                # default=str stringifies any value json cannot serialize.
                json.dump(corpus.analysis_results, f, indent=2, default=str)
            if console_output:
                msg.good(f"✓ Exported module results to {results_path}")

    # Always generate HTML first; Markdown is derived from it.
    html_report = _build_html_report(corpus, stats, quality, zipf)

    # Determine output format
    if html:
        report = html_report
        ext = ".html"
    else:
        # Convert HTML to Markdown
        report = convert(html_report)
        ext = ".md"

    # Save report if output_dir is provided
    if output_dir is not None:
        report_path = output_dir / f"analysis_report{ext}"
        with open(report_path, "w", encoding="utf-8") as f:
            f.write(report)
        if console_output:
            msg.good(f"✓ Created comprehensive report at {report_path}")

            msg.good(f"\n📂 All files saved to: {output_dir.absolute()}")
            msg.good("\n📋 Files created:")
            # Sorted so the listing is stable across platforms/filesystems.
            for file_path in sorted(output_dir.glob("*")):
                file_size = file_path.stat().st_size
                msg.info(f"   📄 {file_path.name}: {file_size:,} bytes")

            msg.good("\n💡 Sharing Tips:")
            msg.info("   📊 Use CSV files for data analysis in Excel/R/Python")
            msg.info("   📋 Share the report file for quick overview")
            msg.info("   🔗 Use JSON files for integration with other tools")
            msg.info(
                "   💾 The corpus directory contains all original documents and metadata"
            )

    return report
rendering:
  show_root_heading: true
  heading_level: 3