Add logistic regression sentiment analysis

This commit is contained in:
samhithamuvva 2024-10-10 12:43:06 -07:00
parent dd9bfa8d33
commit 422e383d8f
23 changed files with 1116 additions and 378 deletions

View File

@@ -227,6 +227,9 @@ nlp = en_core_web_sm.load()
doc = nlp("This is a sentence.")
```
📖 **For more info and examples, check out the
[models documentation](https://spacy.io/docs/usage/models).**
## 📊 Custom Sentiment Analysis with Logistic Regression (spaCy-based)
This repository also includes a custom **Logistic Regression** sentiment analysis model built with spaCy and NumPy, with no scikit-learn dependency. The model classifies text as positive or negative and can be trained on a dataset such as IMDb reviews.
@@ -234,24 +237,28 @@ This repository also includes a custom **Logistic Regression** sentiment analysi
To run the logistic regression model:
```bash
python pure_Logistic.py
```
This script processes the dataset using spaCy, trains the logistic regression model, and outputs the results.
### Testing and Evaluation
To run tests and evaluate the model's performance, use:
```bash
python test_pure_logistic.py
```
To use the model in your own code, import the `PureLogisticTextCategorizer` class in your test script:
```python
from pure_Logistic import PureLogisticTextCategorizer

# Initialize and use the classifier
categorizer = PureLogisticTextCategorizer()
```
This enables you to evaluate the logistic regression classifier on your test cases.
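For an end-to-end picture, here is a minimal sketch (illustrative, condensed from the training and evaluation script in this commit) of registering the component on a pipeline, training it, and reading predictions back off a `Doc`:
```python
import spacy
from spacy.training import Example
from pure_Logistic import make_pure_logistic_textcat  # noqa: F401 (import registers the factory)

nlp = spacy.load("en_core_web_lg")  # document vectors are used as features
textcat = nlp.add_pipe("pure_logistic_textcat")
textcat.labels = {"positive", "negative"}

train_examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in [
        ("This is amazing!", {"cats": {"positive": 1.0, "negative": 0.0}}),
        ("This is terrible!", {"cats": {"positive": 0.0, "negative": 1.0}}),
    ]
]
losses = textcat.update(train_examples)

doc = nlp("This product is fantastic!")
print(doc._.textcat_scores)  # {'positive': ..., 'negative': ...}
```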
📖 **For more info and examples, check out the
[models documentation](https://spacy.io/docs/usage/models).**
## ⚒ Compile from source
The other way to install spaCy is to clone its

View File

@@ -0,0 +1,138 @@
import spacy
from spacy.training import Example
from spacy.tokens import Doc
from typing import Dict, List
# Import the custom component module so its @Language.factory decorator
# registers "pure_logistic_textcat" with spaCy
from pure_Logistic import make_pure_logistic_textcat  # noqa: F401
# Register the custom extension 'textcat_scores' (also registered by the
# component itself) so predictions can be read back off each Doc
if not Doc.has_extension("textcat_scores"):
    Doc.set_extension("textcat_scores", default={})
# Sample training and testing data
TRAIN_DATA = [
("This product is amazing! I love it.", {"cats": {"positive": 1.0, "negative": 0.0}}),
("The service was excellent and staff very friendly.", {"cats": {"positive": 1.0, "negative": 0.0}}),
("I'm really impressed with the quality.", {"cats": {"positive": 1.0, "negative": 0.0}}),
("Best purchase I've made in years!", {"cats": {"positive": 1.0, "negative": 0.0}}),
("The features work exactly as advertised.", {"cats": {"positive": 1.0, "negative": 0.0}}),
("This is terrible, complete waste of money.", {"cats": {"positive": 0.0, "negative": 1.0}}),
("Poor customer service, very disappointing.", {"cats": {"positive": 0.0, "negative": 1.0}}),
("The product broke after one week.", {"cats": {"positive": 0.0, "negative": 1.0}}),
("Would not recommend to anyone.", {"cats": {"positive": 0.0, "negative": 1.0}}),
("Save your money and avoid this.", {"cats": {"positive": 0.0, "negative": 1.0}})
]
TEST_DATA = [
("Great product, highly recommend!", {"cats": {"positive": 1.0, "negative": 0.0}}),
("Not worth the price at all.", {"cats": {"positive": 0.0, "negative": 1.0}}),
("Everything works perfectly.", {"cats": {"positive": 1.0, "negative": 0.0}}),
("Disappointed with the results.", {"cats": {"positive": 0.0, "negative": 1.0}})
]
def calculate_metrics(true_positives: int, true_negatives: int, false_positives: int, false_negatives: int) -> Dict[str, float]:
"""Calculate evaluation metrics based on counts."""
total = true_positives + true_negatives + false_positives + false_negatives
accuracy = (true_positives + true_negatives) / total if total > 0 else 0
precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
return {
"accuracy": accuracy,
"precision": precision,
"recall": recall,
"f1": f1
}
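# Worked example (illustrative): with TP=3, TN=4, FP=1, FN=2:
#   accuracy  = (3 + 4) / 10                      = 0.70
#   precision = 3 / (3 + 1)                       = 0.75
#   recall    = 3 / (3 + 2)                       = 0.60
#   f1        = 2 * (0.75 * 0.60) / (0.75 + 0.60) ≈ 0.667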
def evaluate_model(nlp, test_data):
"""Evaluate the model using the test data."""
true_positives = true_negatives = false_positives = false_negatives = 0
predictions = []
for text, annotations in test_data:
doc = nlp(text)
true_cats = annotations["cats"]
        pred_cats = doc._.textcat_scores  # Predictions from the custom model
        # Extract scores for 'positive' and 'negative'
        pred_positive_score = pred_cats.get("positive", 0.0)
        true_positive_score = true_cats.get("positive", 0.0)
pred_positive = float(pred_positive_score) > 0.5
true_positive = float(true_positive_score) > 0.5
# Update counts based on predictions
if true_positive and pred_positive:
true_positives += 1
elif not true_positive and not pred_positive:
true_negatives += 1
elif not true_positive and pred_positive:
false_positives += 1
else:
false_negatives += 1
predictions.append({
"text": text,
"true": "positive" if true_positive else "negative",
"predicted": "positive" if pred_positive else "negative",
"scores": pred_cats
})
metrics = calculate_metrics(true_positives, true_negatives, false_positives, false_negatives)
return metrics, predictions
def main():
try:
print("Loading spaCy model...")
nlp = spacy.load("en_core_web_lg")
except OSError:
print("Downloading spaCy model...")
spacy.cli.download("en_core_web_lg")
nlp = spacy.load("en_core_web_lg")
print("Adding custom text categorizer...")
config = {
"learning_rate": 0.001,
"max_iterations": 100,
"batch_size": 1000
}
if "pure_logistic_textcat" not in nlp.pipe_names:
textcat = nlp.add_pipe("pure_logistic_textcat", config=config)
textcat.labels = {"positive", "negative"}
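        # Note: assigning to .labels resets the component's weights and bias
        # (see the labels setter in pure_Logistic.py), so set labels before training.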
print("Preparing training examples...")
train_examples = []
for text, annotations in TRAIN_DATA:
doc = nlp.make_doc(text)
example = Example.from_dict(doc, annotations)
train_examples.append(example)
print("Training the model...")
textcat = nlp.get_pipe("pure_logistic_textcat")
losses = textcat.update(train_examples)
print(f"Training losses: {losses}")
print("\nEvaluating the model...")
metrics, predictions = evaluate_model(nlp, TEST_DATA)
print("\nEvaluation Metrics:")
print(f"Accuracy: {metrics['accuracy']:.3f}")
print(f"Precision: {metrics['precision']:.3f}")
print(f"Recall: {metrics['recall']:.3f}")
print(f"F1 Score: {metrics['f1']:.3f}")
print("\nDetailed Predictions:")
for pred in predictions:
print(f"\nText: {pred['text']}")
print(f"True label: {pred['true']}")
print(f"Predicted: {pred['predicted']}")
print(f"Positive score: {pred['scores']['positive']:.3f}")
print(f"Negative score: {pred['scores']['negative']:.3f}")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,443 @@
<#
.Synopsis
Activate a Python virtual environment for the current PowerShell session.
.Description
Pushes the python executable for a virtual environment to the front of the
$Env:PATH environment variable and sets the prompt to signify that you are
in a Python virtual environment. Makes use of the command line switches as
well as the `pyvenv.cfg` file values present in the virtual environment.
.Parameter VenvDir
Path to the directory that contains the virtual environment to activate. The
default value for this is the parent of the directory that the Activate.ps1
script is located within.
.Parameter Prompt
The prompt prefix to display when this virtual environment is activated. By
default, this prompt is the name of the virtual environment folder (VenvDir)
surrounded by parentheses and followed by a single space (ie. '(.venv) ').
.Example
Activate.ps1
Activates the Python virtual environment that contains the Activate.ps1 script.
.Example
Activate.ps1 -Verbose
Activates the Python virtual environment that contains the Activate.ps1 script,
and shows extra information about the activation as it executes.
.Example
Activate.ps1 -VenvDir C:\Users\MyUser\Common\.venv
Activates the Python virtual environment located in the specified location.
.Example
Activate.ps1 -Prompt "MyPython"
Activates the Python virtual environment that contains the Activate.ps1 script,
and prefixes the current prompt with the specified string (surrounded in
parentheses) while the virtual environment is active.
.Notes
On Windows, it may be required to enable this Activate.ps1 script by setting the
execution policy for the user. You can do this by issuing the following PowerShell
command:
PS C:\> Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
For more information on Execution Policies:
https://go.microsoft.com/fwlink/?LinkID=135170
#>
Param(
[Parameter(Mandatory = $false)]
[String]
$VenvDir,
[Parameter(Mandatory = $false)]
[String]
$Prompt
)
<# Function declarations --------------------------------------------------- #>
<#
.Synopsis
Remove all shell session elements added by the Activate script, including the
addition of the virtual environment's Python executable from the beginning of
the PATH variable.
.Parameter NonDestructive
If present, do not remove this function from the global namespace for the
session.
#>
function global:deactivate ([switch]$NonDestructive) {
# Revert to original values
# The prior prompt:
if (Test-Path -Path Function:_OLD_VIRTUAL_PROMPT) {
Copy-Item -Path Function:_OLD_VIRTUAL_PROMPT -Destination Function:prompt
Remove-Item -Path Function:_OLD_VIRTUAL_PROMPT
}
# The prior PYTHONHOME:
if (Test-Path -Path Env:_OLD_VIRTUAL_PYTHONHOME) {
Copy-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME -Destination Env:PYTHONHOME
Remove-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME
}
# The prior PATH:
if (Test-Path -Path Env:_OLD_VIRTUAL_PATH) {
Copy-Item -Path Env:_OLD_VIRTUAL_PATH -Destination Env:PATH
Remove-Item -Path Env:_OLD_VIRTUAL_PATH
}
# Just remove the VIRTUAL_ENV altogether:
if (Test-Path -Path Env:VIRTUAL_ENV) {
Remove-Item -Path env:VIRTUAL_ENV
}
# Just remove VIRTUAL_ENV_PROMPT altogether.
if (Test-Path -Path Env:VIRTUAL_ENV_PROMPT) {
Remove-Item -Path env:VIRTUAL_ENV_PROMPT
}
# Just remove the _PYTHON_VENV_PROMPT_PREFIX altogether:
if (Get-Variable -Name "_PYTHON_VENV_PROMPT_PREFIX" -ErrorAction SilentlyContinue) {
Remove-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Scope Global -Force
}
# Leave deactivate function in the global namespace if requested:
if (-not $NonDestructive) {
Remove-Item -Path function:deactivate
}
}
<#
.Description
Get-PyVenvConfig parses the values from the pyvenv.cfg file located in the
given folder, and returns them in a map.
For each line in the pyvenv.cfg file, if that line can be parsed into exactly
two strings separated by `=` (with any amount of whitespace surrounding the =)
then it is considered a `key = value` line. The left hand string is the key,
the right hand is the value.
If the value starts with a `'` or a `"` then the first and last character is
stripped from the value before being captured.
.Parameter ConfigDir
Path to the directory that contains the `pyvenv.cfg` file.
#>
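# Example pyvenv.cfg content this function parses (matching the pyvenv.cfg
# committed alongside this environment):
#   home = C:\Python310
#   include-system-site-packages = false
#   version = 3.10.5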
function Get-PyVenvConfig(
[String]
$ConfigDir
) {
Write-Verbose "Given ConfigDir=$ConfigDir, obtain values in pyvenv.cfg"
# Ensure the file exists, and issue a warning if it doesn't (but still allow the function to continue).
$pyvenvConfigPath = Join-Path -Resolve -Path $ConfigDir -ChildPath 'pyvenv.cfg' -ErrorAction Continue
# An empty map will be returned if no config file is found.
$pyvenvConfig = @{ }
if ($pyvenvConfigPath) {
Write-Verbose "File exists, parse `key = value` lines"
$pyvenvConfigContent = Get-Content -Path $pyvenvConfigPath
$pyvenvConfigContent | ForEach-Object {
$keyval = $PSItem -split "\s*=\s*", 2
if ($keyval[0] -and $keyval[1]) {
$val = $keyval[1]
# Remove extraneous quotations around a string value.
if ("'""".Contains($val.Substring(0, 1))) {
$val = $val.Substring(1, $val.Length - 2)
}
$pyvenvConfig[$keyval[0]] = $val
Write-Verbose "Adding Key: '$($keyval[0])'='$val'"
}
}
}
return $pyvenvConfig
}
<# Begin Activate script --------------------------------------------------- #>
# Determine the containing directory of this script
$VenvExecPath = Split-Path -Parent $MyInvocation.MyCommand.Definition
$VenvExecDir = Get-Item -Path $VenvExecPath
Write-Verbose "Activation script is located in path: '$VenvExecPath'"
Write-Verbose "VenvExecDir Fullname: '$($VenvExecDir.FullName)"
Write-Verbose "VenvExecDir Name: '$($VenvExecDir.Name)"
# Set values required in priority: CmdLine, ConfigFile, Default
# First, get the location of the virtual environment, it might not be
# VenvExecDir if specified on the command line.
if ($VenvDir) {
Write-Verbose "VenvDir given as parameter, using '$VenvDir' to determine values"
}
else {
Write-Verbose "VenvDir not given as a parameter, using parent directory name as VenvDir."
$VenvDir = $VenvExecDir.Parent.FullName.TrimEnd("\\/")
Write-Verbose "VenvDir=$VenvDir"
}
# Next, read the `pyvenv.cfg` file to determine any required value such
# as `prompt`.
$pyvenvCfg = Get-PyVenvConfig -ConfigDir $VenvDir
# Next, set the prompt from the command line, or the config file, or
# just use the name of the virtual environment folder.
if ($Prompt) {
Write-Verbose "Prompt specified as argument, using '$Prompt'"
}
else {
Write-Verbose "Prompt not specified as argument to script, checking pyvenv.cfg value"
if ($pyvenvCfg -and $pyvenvCfg['prompt']) {
Write-Verbose " Setting based on value in pyvenv.cfg='$($pyvenvCfg['prompt'])'"
$Prompt = $pyvenvCfg['prompt'];
}
else {
Write-Verbose " Setting prompt based on parent's directory's name. (Is the directory name passed to venv module when creating the virtual environment)"
Write-Verbose " Got leaf-name of $VenvDir='$(Split-Path -Path $venvDir -Leaf)'"
$Prompt = Split-Path -Path $venvDir -Leaf
}
}
Write-Verbose "Prompt = '$Prompt'"
Write-Verbose "VenvDir='$VenvDir'"
# Deactivate any currently active virtual environment, but leave the
# deactivate function in place.
deactivate -nondestructive
# Now set the environment variable VIRTUAL_ENV, used by many tools to determine
# that there is an activated venv.
$env:VIRTUAL_ENV = $VenvDir
if (-not $Env:VIRTUAL_ENV_DISABLE_PROMPT) {
Write-Verbose "Setting prompt to '$Prompt'"
# Set the prompt to include the env name
# Make sure _OLD_VIRTUAL_PROMPT is global
function global:_OLD_VIRTUAL_PROMPT { "" }
Copy-Item -Path function:prompt -Destination function:_OLD_VIRTUAL_PROMPT
New-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Description "Python virtual environment prompt prefix" -Scope Global -Option ReadOnly -Visibility Public -Value $Prompt
function global:prompt {
Write-Host -NoNewline -ForegroundColor Green "($_PYTHON_VENV_PROMPT_PREFIX) "
_OLD_VIRTUAL_PROMPT
}
$env:VIRTUAL_ENV_PROMPT = $Prompt
}
# Clear PYTHONHOME
if (Test-Path -Path Env:PYTHONHOME) {
Copy-Item -Path Env:PYTHONHOME -Destination Env:_OLD_VIRTUAL_PYTHONHOME
Remove-Item -Path Env:PYTHONHOME
}
# Add the venv to the PATH
Copy-Item -Path Env:PATH -Destination Env:_OLD_VIRTUAL_PATH
$Env:PATH = "$VenvExecDir$([System.IO.Path]::PathSeparator)$Env:PATH"
# SIG # Begin signature block
# MIIj/wYJKoZIhvcNAQcCoIIj8DCCI+wCAQExDzANBglghkgBZQMEAgEFADB5Bgor
# BgEEAYI3AgEEoGswaTA0BgorBgEEAYI3AgEeMCYCAwEAAAQQH8w7YFlLCE63JNLG
# KX7zUQIBAAIBAAIBAAIBAAIBADAxMA0GCWCGSAFlAwQCAQUABCBnL745ElCYk8vk
# dBtMuQhLeWJ3ZGfzKW4DHCYzAn+QB6CCDi8wggawMIIEmKADAgECAhAIrUCyYNKc
# TJ9ezam9k67ZMA0GCSqGSIb3DQEBDAUAMGIxCzAJBgNVBAYTAlVTMRUwEwYDVQQK
# EwxEaWdpQ2VydCBJbmMxGTAXBgNVBAsTEHd3dy5kaWdpY2VydC5jb20xITAfBgNV
# BAMTGERpZ2lDZXJ0IFRydXN0ZWQgUm9vdCBHNDAeFw0yMTA0MjkwMDAwMDBaFw0z
# NjA0MjgyMzU5NTlaMGkxCzAJBgNVBAYTAlVTMRcwFQYDVQQKEw5EaWdpQ2VydCwg
# SW5jLjFBMD8GA1UEAxM4RGlnaUNlcnQgVHJ1c3RlZCBHNCBDb2RlIFNpZ25pbmcg
# UlNBNDA5NiBTSEEzODQgMjAyMSBDQTEwggIiMA0GCSqGSIb3DQEBAQUAA4ICDwAw
# ggIKAoICAQDVtC9C0CiteLdd1TlZG7GIQvUzjOs9gZdwxbvEhSYwn6SOaNhc9es0
# JAfhS0/TeEP0F9ce2vnS1WcaUk8OoVf8iJnBkcyBAz5NcCRks43iCH00fUyAVxJr
# Q5qZ8sU7H/Lvy0daE6ZMswEgJfMQ04uy+wjwiuCdCcBlp/qYgEk1hz1RGeiQIXhF
# LqGfLOEYwhrMxe6TSXBCMo/7xuoc82VokaJNTIIRSFJo3hC9FFdd6BgTZcV/sk+F
# LEikVoQ11vkunKoAFdE3/hoGlMJ8yOobMubKwvSnowMOdKWvObarYBLj6Na59zHh
# 3K3kGKDYwSNHR7OhD26jq22YBoMbt2pnLdK9RBqSEIGPsDsJ18ebMlrC/2pgVItJ
# wZPt4bRc4G/rJvmM1bL5OBDm6s6R9b7T+2+TYTRcvJNFKIM2KmYoX7BzzosmJQay
# g9Rc9hUZTO1i4F4z8ujo7AqnsAMrkbI2eb73rQgedaZlzLvjSFDzd5Ea/ttQokbI
# YViY9XwCFjyDKK05huzUtw1T0PhH5nUwjewwk3YUpltLXXRhTT8SkXbev1jLchAp
# QfDVxW0mdmgRQRNYmtwmKwH0iU1Z23jPgUo+QEdfyYFQc4UQIyFZYIpkVMHMIRro
# OBl8ZhzNeDhFMJlP/2NPTLuqDQhTQXxYPUez+rbsjDIJAsxsPAxWEQIDAQABo4IB
# WTCCAVUwEgYDVR0TAQH/BAgwBgEB/wIBADAdBgNVHQ4EFgQUaDfg67Y7+F8Rhvv+
# YXsIiGX0TkIwHwYDVR0jBBgwFoAU7NfjgtJxXWRM3y5nP+e6mK4cD08wDgYDVR0P
# AQH/BAQDAgGGMBMGA1UdJQQMMAoGCCsGAQUFBwMDMHcGCCsGAQUFBwEBBGswaTAk
# BggrBgEFBQcwAYYYaHR0cDovL29jc3AuZGlnaWNlcnQuY29tMEEGCCsGAQUFBzAC
# hjVodHRwOi8vY2FjZXJ0cy5kaWdpY2VydC5jb20vRGlnaUNlcnRUcnVzdGVkUm9v
# dEc0LmNydDBDBgNVHR8EPDA6MDigNqA0hjJodHRwOi8vY3JsMy5kaWdpY2VydC5j
# b20vRGlnaUNlcnRUcnVzdGVkUm9vdEc0LmNybDAcBgNVHSAEFTATMAcGBWeBDAED
# MAgGBmeBDAEEATANBgkqhkiG9w0BAQwFAAOCAgEAOiNEPY0Idu6PvDqZ01bgAhql
# +Eg08yy25nRm95RysQDKr2wwJxMSnpBEn0v9nqN8JtU3vDpdSG2V1T9J9Ce7FoFF
# UP2cvbaF4HZ+N3HLIvdaqpDP9ZNq4+sg0dVQeYiaiorBtr2hSBh+3NiAGhEZGM1h
# mYFW9snjdufE5BtfQ/g+lP92OT2e1JnPSt0o618moZVYSNUa/tcnP/2Q0XaG3Ryw
# YFzzDaju4ImhvTnhOE7abrs2nfvlIVNaw8rpavGiPttDuDPITzgUkpn13c5Ubdld
# AhQfQDN8A+KVssIhdXNSy0bYxDQcoqVLjc1vdjcshT8azibpGL6QB7BDf5WIIIJw
# 8MzK7/0pNVwfiThV9zeKiwmhywvpMRr/LhlcOXHhvpynCgbWJme3kuZOX956rEnP
# LqR0kq3bPKSchh/jwVYbKyP/j7XqiHtwa+aguv06P0WmxOgWkVKLQcBIhEuWTatE
# QOON8BUozu3xGFYHKi8QxAwIZDwzj64ojDzLj4gLDb879M4ee47vtevLt/B3E+bn
# KD+sEq6lLyJsQfmCXBVmzGwOysWGw/YmMwwHS6DTBwJqakAwSEs0qFEgu60bhQji
# WQ1tygVQK+pKHJ6l/aCnHwZ05/LWUpD9r4VIIflXO7ScA+2GRfS0YW6/aOImYIbq
# yK+p/pQd52MbOoZWeE4wggd3MIIFX6ADAgECAhAHHxQbizANJfMU6yMM0NHdMA0G
# CSqGSIb3DQEBCwUAMGkxCzAJBgNVBAYTAlVTMRcwFQYDVQQKEw5EaWdpQ2VydCwg
# SW5jLjFBMD8GA1UEAxM4RGlnaUNlcnQgVHJ1c3RlZCBHNCBDb2RlIFNpZ25pbmcg
# UlNBNDA5NiBTSEEzODQgMjAyMSBDQTEwHhcNMjIwMTE3MDAwMDAwWhcNMjUwMTE1
# MjM1OTU5WjB8MQswCQYDVQQGEwJVUzEPMA0GA1UECBMGT3JlZ29uMRIwEAYDVQQH
# EwlCZWF2ZXJ0b24xIzAhBgNVBAoTGlB5dGhvbiBTb2Z0d2FyZSBGb3VuZGF0aW9u
# MSMwIQYDVQQDExpQeXRob24gU29mdHdhcmUgRm91bmRhdGlvbjCCAiIwDQYJKoZI
# hvcNAQEBBQADggIPADCCAgoCggIBAKgc0BTT+iKbtK6f2mr9pNMUTcAJxKdsuOiS
# YgDFfwhjQy89koM7uP+QV/gwx8MzEt3c9tLJvDccVWQ8H7mVsk/K+X+IufBLCgUi
# 0GGAZUegEAeRlSXxxhYScr818ma8EvGIZdiSOhqjYc4KnfgfIS4RLtZSrDFG2tN1
# 6yS8skFa3IHyvWdbD9PvZ4iYNAS4pjYDRjT/9uzPZ4Pan+53xZIcDgjiTwOh8VGu
# ppxcia6a7xCyKoOAGjvCyQsj5223v1/Ig7Dp9mGI+nh1E3IwmyTIIuVHyK6Lqu35
# 2diDY+iCMpk9ZanmSjmB+GMVs+H/gOiofjjtf6oz0ki3rb7sQ8fTnonIL9dyGTJ0
# ZFYKeb6BLA66d2GALwxZhLe5WH4Np9HcyXHACkppsE6ynYjTOd7+jN1PRJahN1oE
# RzTzEiV6nCO1M3U1HbPTGyq52IMFSBM2/07WTJSbOeXjvYR7aUxK9/ZkJiacl2iZ
# I7IWe7JKhHohqKuceQNyOzxTakLcRkzynvIrk33R9YVqtB4L6wtFxhUjvDnQg16x
# ot2KVPdfyPAWd81wtZADmrUtsZ9qG79x1hBdyOl4vUtVPECuyhCxaw+faVjumapP
# Unwo8ygflJJ74J+BYxf6UuD7m8yzsfXWkdv52DjL74TxzuFTLHPyARWCSCAbzn3Z
# Ily+qIqDAgMBAAGjggIGMIICAjAfBgNVHSMEGDAWgBRoN+Drtjv4XxGG+/5hewiI
# ZfROQjAdBgNVHQ4EFgQUt/1Teh2XDuUj2WW3siYWJgkZHA8wDgYDVR0PAQH/BAQD
# AgeAMBMGA1UdJQQMMAoGCCsGAQUFBwMDMIG1BgNVHR8Ega0wgaowU6BRoE+GTWh0
# dHA6Ly9jcmwzLmRpZ2ljZXJ0LmNvbS9EaWdpQ2VydFRydXN0ZWRHNENvZGVTaWdu
# aW5nUlNBNDA5NlNIQTM4NDIwMjFDQTEuY3JsMFOgUaBPhk1odHRwOi8vY3JsNC5k
# aWdpY2VydC5jb20vRGlnaUNlcnRUcnVzdGVkRzRDb2RlU2lnbmluZ1JTQTQwOTZT
# SEEzODQyMDIxQ0ExLmNybDA+BgNVHSAENzA1MDMGBmeBDAEEATApMCcGCCsGAQUF
# BwIBFhtodHRwOi8vd3d3LmRpZ2ljZXJ0LmNvbS9DUFMwgZQGCCsGAQUFBwEBBIGH
# MIGEMCQGCCsGAQUFBzABhhhodHRwOi8vb2NzcC5kaWdpY2VydC5jb20wXAYIKwYB
# BQUHMAKGUGh0dHA6Ly9jYWNlcnRzLmRpZ2ljZXJ0LmNvbS9EaWdpQ2VydFRydXN0
# ZWRHNENvZGVTaWduaW5nUlNBNDA5NlNIQTM4NDIwMjFDQTEuY3J0MAwGA1UdEwEB
# /wQCMAAwDQYJKoZIhvcNAQELBQADggIBABxv4AeV/5ltkELHSC63fXAFYS5tadcW
# TiNc2rskrNLrfH1Ns0vgSZFoQxYBFKI159E8oQQ1SKbTEubZ/B9kmHPhprHya08+
# VVzxC88pOEvz68nA82oEM09584aILqYmj8Pj7h/kmZNzuEL7WiwFa/U1hX+XiWfL
# IJQsAHBla0i7QRF2de8/VSF0XXFa2kBQ6aiTsiLyKPNbaNtbcucaUdn6vVUS5izW
# OXM95BSkFSKdE45Oq3FForNJXjBvSCpwcP36WklaHL+aHu1upIhCTUkzTHMh8b86
# WmjRUqbrnvdyR2ydI5l1OqcMBjkpPpIV6wcc+KY/RH2xvVuuoHjlUjwq2bHiNoX+
# W1scCpnA8YTs2d50jDHUgwUo+ciwpffH0Riq132NFmrH3r67VaN3TuBxjI8SIZM5
# 8WEDkbeoriDk3hxU8ZWV7b8AW6oyVBGfM06UgkfMb58h+tJPrFx8VI/WLq1dTqMf
# ZOm5cuclMnUHs2uqrRNtnV8UfidPBL4ZHkTcClQbCoz0UbLhkiDvIS00Dn+BBcxw
# /TKqVL4Oaz3bkMSsM46LciTeucHY9ExRVt3zy7i149sd+F4QozPqn7FrSVHXmem3
# r7bjyHTxOgqxRCVa18Vtx7P/8bYSBeS+WHCKcliFCecspusCDSlnRUjZwyPdP0VH
# xaZg2unjHY3rMYIVJjCCFSICAQEwfTBpMQswCQYDVQQGEwJVUzEXMBUGA1UEChMO
# RGlnaUNlcnQsIEluYy4xQTA/BgNVBAMTOERpZ2lDZXJ0IFRydXN0ZWQgRzQgQ29k
# ZSBTaWduaW5nIFJTQTQwOTYgU0hBMzg0IDIwMjEgQ0ExAhAHHxQbizANJfMU6yMM
# 0NHdMA0GCWCGSAFlAwQCAQUAoIHEMBkGCSqGSIb3DQEJAzEMBgorBgEEAYI3AgEE
# MBwGCisGAQQBgjcCAQsxDjAMBgorBgEEAYI3AgEVMC8GCSqGSIb3DQEJBDEiBCBn
# AZ6P7YvTwq0fbF62o7E75R0LxsW5OtyYiFESQckLhjBYBgorBgEEAYI3AgEMMUow
# SKBGgEQAQgB1AGkAbAB0ADoAIABSAGUAbABlAGEAcwBlAF8AdgAzAC4AMQAwAC4A
# NQBfADIAMAAyADIAMAA2ADAANgAuADAAMTANBgkqhkiG9w0BAQEFAASCAgA5LMM8
# 8+phW11oF/PTFxitR3oW7QHlGHA97n1MCieor042JtmqUyqqf7ykapKc/ND4pVDP
# DP8nhIeXuLd2/SHqqf6CLZX9yacAFPDCV/MtYhlw4yKwa2ECw9EDDwB670UwUW/j
# IUl+fSrWagwH2WC7T5iMiV7uEZU4koGuOS4SiDzRLwTcuRtY6N/FYerQhioHXzdX
# vO76qXnj4UIDWnWbSWLgPDo8g4xonm7BC0dFRn4WW8tgm/StxQ/TBS4L2O/LEjYy
# pSLEXOy0INrA5CqWd4J4dpOhkQng1UJoySCL9Q2ceyv1U3SrywLY4rLwmSrZYsbQ
# OpnL+P1DP/eHYPbcwQEhbaTj81ULMxNDnouXJMm6ErMgTRH6TTpDcuPI8qlqkT2E
# DGZ4pPdZSHxDYkocJ6REh1YKlpvdHaGQFkXuc3p2lG/siv2rtDefI4wChN4VOHZG
# ia6G3FZaIyqFW/0sFz5KOzxoxcjfzyO76SSJx9jYpuOmPrHihaOlFjzZGxnWwFdM
# l3uCD+QeJL2bkl7npoyW0RRznBUUj21psHdVN5vzK+Gsyr22A9lS1XaX3a2KJ6bl
# Krkj+PObW5dtxvso0bQss2FCFdOATk4AlFcmk6bWk8rZm+w4e9NugsCTI+IE45hL
# AEyzTjc21JqGt8l2Rn/eElRHgsjvNpO4H5FFo6GCEbMwghGvBgorBgEEAYI3AwMB
# MYIRnzCCEZsGCSqGSIb3DQEHAqCCEYwwghGIAgEDMQ8wDQYJYIZIAWUDBAIBBQAw
# eAYLKoZIhvcNAQkQAQSgaQRnMGUCAQEGCWCGSAGG/WwHATAxMA0GCWCGSAFlAwQC
# AQUABCDX6Ys0ehzU7Uygr+TZMXB4pMkJvCegnm5JrODTttrXZwIRAMaBOV1Pb1sY
# w0ypALrk6u8YDzIwMjIwNjA2MTYyMjEwWqCCDXwwggbGMIIErqADAgECAhAKekqI
# nsmZQpAGYzhNhpedMA0GCSqGSIb3DQEBCwUAMGMxCzAJBgNVBAYTAlVTMRcwFQYD
# VQQKEw5EaWdpQ2VydCwgSW5jLjE7MDkGA1UEAxMyRGlnaUNlcnQgVHJ1c3RlZCBH
# NCBSU0E0MDk2IFNIQTI1NiBUaW1lU3RhbXBpbmcgQ0EwHhcNMjIwMzI5MDAwMDAw
# WhcNMzMwMzE0MjM1OTU5WjBMMQswCQYDVQQGEwJVUzEXMBUGA1UEChMORGlnaUNl
# cnQsIEluYy4xJDAiBgNVBAMTG0RpZ2lDZXJ0IFRpbWVzdGFtcCAyMDIyIC0gMjCC
# AiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBALkqliOmXLxf1knwFYIY9DPu
# zFxs4+AlLtIx5DxArvurxON4XX5cNur1JY1Do4HrOGP5PIhp3jzSMFENMQe6Rm7p
# o0tI6IlBfw2y1vmE8Zg+C78KhBJxbKFiJgHTzsNs/aw7ftwqHKm9MMYW2Nq867Lx
# g9GfzQnFuUFqRUIjQVr4YNNlLD5+Xr2Wp/D8sfT0KM9CeR87x5MHaGjlRDRSXw9Q
# 3tRZLER0wDJHGVvimC6P0Mo//8ZnzzyTlU6E6XYYmJkRFMUrDKAz200kheiClOEv
# A+5/hQLJhuHVGBS3BEXz4Di9or16cZjsFef9LuzSmwCKrB2NO4Bo/tBZmCbO4O2u
# fyguwp7gC0vICNEyu4P6IzzZ/9KMu/dDI9/nw1oFYn5wLOUrsj1j6siugSBrQ4nI
# fl+wGt0ZvZ90QQqvuY4J03ShL7BUdsGQT5TshmH/2xEvkgMwzjC3iw9dRLNDHSNQ
# zZHXL537/M2xwafEDsTvQD4ZOgLUMalpoEn5deGb6GjkagyP6+SxIXuGZ1h+fx/o
# K+QUshbWgaHK2jCQa+5vdcCwNiayCDv/vb5/bBMY38ZtpHlJrYt/YYcFaPfUcONC
# leieu5tLsuK2QT3nr6caKMmtYbCgQRgZTu1Hm2GV7T4LYVrqPnqYklHNP8lE54CL
# KUJy93my3YTqJ+7+fXprAgMBAAGjggGLMIIBhzAOBgNVHQ8BAf8EBAMCB4AwDAYD
# VR0TAQH/BAIwADAWBgNVHSUBAf8EDDAKBggrBgEFBQcDCDAgBgNVHSAEGTAXMAgG
# BmeBDAEEAjALBglghkgBhv1sBwEwHwYDVR0jBBgwFoAUuhbZbU2FL3MpdpovdYxq
# II+eyG8wHQYDVR0OBBYEFI1kt4kh/lZYRIRhp+pvHDaP3a8NMFoGA1UdHwRTMFEw
# T6BNoEuGSWh0dHA6Ly9jcmwzLmRpZ2ljZXJ0LmNvbS9EaWdpQ2VydFRydXN0ZWRH
# NFJTQTQwOTZTSEEyNTZUaW1lU3RhbXBpbmdDQS5jcmwwgZAGCCsGAQUFBwEBBIGD
# MIGAMCQGCCsGAQUFBzABhhhodHRwOi8vb2NzcC5kaWdpY2VydC5jb20wWAYIKwYB
# BQUHMAKGTGh0dHA6Ly9jYWNlcnRzLmRpZ2ljZXJ0LmNvbS9EaWdpQ2VydFRydXN0
# ZWRHNFJTQTQwOTZTSEEyNTZUaW1lU3RhbXBpbmdDQS5jcnQwDQYJKoZIhvcNAQEL
# BQADggIBAA0tI3Sm0fX46kuZPwHk9gzkrxad2bOMl4IpnENvAS2rOLVwEb+EGYs/
# XeWGT76TOt4qOVo5TtiEWaW8G5iq6Gzv0UhpGThbz4k5HXBw2U7fIyJs1d/2Wcuh
# wupMdsqh3KErlribVakaa33R9QIJT4LWpXOIxJiA3+5JlbezzMWn7g7h7x44ip/v
# EckxSli23zh8y/pc9+RTv24KfH7X3pjVKWWJD6KcwGX0ASJlx+pedKZbNZJQfPQX
# podkTz5GiRZjIGvL8nvQNeNKcEiptucdYL0EIhUlcAZyqUQ7aUcR0+7px6A+TxC5
# MDbk86ppCaiLfmSiZZQR+24y8fW7OK3NwJMR1TJ4Sks3KkzzXNy2hcC7cDBVeNaY
# /lRtf3GpSBp43UZ3Lht6wDOK+EoojBKoc88t+dMj8p4Z4A2UKKDr2xpRoJWCjihr
# pM6ddt6pc6pIallDrl/q+A8GQp3fBmiW/iqgdFtjZt5rLLh4qk1wbfAs8QcVfjW0
# 5rUMopml1xVrNQ6F1uAszOAMJLh8UgsemXzvyMjFjFhpr6s94c/MfRWuFL+Kcd/K
# l7HYR+ocheBFThIcFClYzG/Tf8u+wQ5KbyCcrtlzMlkI5y2SoRoR/jKYpl0rl+CL
# 05zMbbUNrkdjOEcXW28T2moQbh9Jt0RbtAgKh1pZBHYRoad3AhMcMIIGrjCCBJag
# AwIBAgIQBzY3tyRUfNhHrP0oZipeWzANBgkqhkiG9w0BAQsFADBiMQswCQYDVQQG
# EwJVUzEVMBMGA1UEChMMRGlnaUNlcnQgSW5jMRkwFwYDVQQLExB3d3cuZGlnaWNl
# cnQuY29tMSEwHwYDVQQDExhEaWdpQ2VydCBUcnVzdGVkIFJvb3QgRzQwHhcNMjIw
# MzIzMDAwMDAwWhcNMzcwMzIyMjM1OTU5WjBjMQswCQYDVQQGEwJVUzEXMBUGA1UE
# ChMORGlnaUNlcnQsIEluYy4xOzA5BgNVBAMTMkRpZ2lDZXJ0IFRydXN0ZWQgRzQg
# UlNBNDA5NiBTSEEyNTYgVGltZVN0YW1waW5nIENBMIICIjANBgkqhkiG9w0BAQEF
# AAOCAg8AMIICCgKCAgEAxoY1BkmzwT1ySVFVxyUDxPKRN6mXUaHW0oPRnkyibaCw
# zIP5WvYRoUQVQl+kiPNo+n3znIkLf50fng8zH1ATCyZzlm34V6gCff1DtITaEfFz
# sbPuK4CEiiIY3+vaPcQXf6sZKz5C3GeO6lE98NZW1OcoLevTsbV15x8GZY2UKdPZ
# 7Gnf2ZCHRgB720RBidx8ald68Dd5n12sy+iEZLRS8nZH92GDGd1ftFQLIWhuNyG7
# QKxfst5Kfc71ORJn7w6lY2zkpsUdzTYNXNXmG6jBZHRAp8ByxbpOH7G1WE15/teP
# c5OsLDnipUjW8LAxE6lXKZYnLvWHpo9OdhVVJnCYJn+gGkcgQ+NDY4B7dW4nJZCY
# OjgRs/b2nuY7W+yB3iIU2YIqx5K/oN7jPqJz+ucfWmyU8lKVEStYdEAoq3NDzt9K
# oRxrOMUp88qqlnNCaJ+2RrOdOqPVA+C/8KI8ykLcGEh/FDTP0kyr75s9/g64ZCr6
# dSgkQe1CvwWcZklSUPRR8zZJTYsg0ixXNXkrqPNFYLwjjVj33GHek/45wPmyMKVM
# 1+mYSlg+0wOI/rOP015LdhJRk8mMDDtbiiKowSYI+RQQEgN9XyO7ZONj4KbhPvbC
# dLI/Hgl27KtdRnXiYKNYCQEoAA6EVO7O6V3IXjASvUaetdN2udIOa5kM0jO0zbEC
# AwEAAaOCAV0wggFZMBIGA1UdEwEB/wQIMAYBAf8CAQAwHQYDVR0OBBYEFLoW2W1N
# hS9zKXaaL3WMaiCPnshvMB8GA1UdIwQYMBaAFOzX44LScV1kTN8uZz/nupiuHA9P
# MA4GA1UdDwEB/wQEAwIBhjATBgNVHSUEDDAKBggrBgEFBQcDCDB3BggrBgEFBQcB
# AQRrMGkwJAYIKwYBBQUHMAGGGGh0dHA6Ly9vY3NwLmRpZ2ljZXJ0LmNvbTBBBggr
# BgEFBQcwAoY1aHR0cDovL2NhY2VydHMuZGlnaWNlcnQuY29tL0RpZ2lDZXJ0VHJ1
# c3RlZFJvb3RHNC5jcnQwQwYDVR0fBDwwOjA4oDagNIYyaHR0cDovL2NybDMuZGln
# aWNlcnQuY29tL0RpZ2lDZXJ0VHJ1c3RlZFJvb3RHNC5jcmwwIAYDVR0gBBkwFzAI
# BgZngQwBBAIwCwYJYIZIAYb9bAcBMA0GCSqGSIb3DQEBCwUAA4ICAQB9WY7Ak7Zv
# mKlEIgF+ZtbYIULhsBguEE0TzzBTzr8Y+8dQXeJLKftwig2qKWn8acHPHQfpPmDI
# 2AvlXFvXbYf6hCAlNDFnzbYSlm/EUExiHQwIgqgWvalWzxVzjQEiJc6VaT9Hd/ty
# dBTX/6tPiix6q4XNQ1/tYLaqT5Fmniye4Iqs5f2MvGQmh2ySvZ180HAKfO+ovHVP
# ulr3qRCyXen/KFSJ8NWKcXZl2szwcqMj+sAngkSumScbqyQeJsG33irr9p6xeZmB
# o1aGqwpFyd/EjaDnmPv7pp1yr8THwcFqcdnGE4AJxLafzYeHJLtPo0m5d2aR8XKc
# 6UsCUqc3fpNTrDsdCEkPlM05et3/JWOZJyw9P2un8WbDQc1PtkCbISFA0LcTJM3c
# HXg65J6t5TRxktcma+Q4c6umAU+9Pzt4rUyt+8SVe+0KXzM5h0F4ejjpnOHdI/0d
# KNPH+ejxmF/7K9h+8kaddSweJywm228Vex4Ziza4k9Tm8heZWcpw8De/mADfIBZP
# J/tgZxahZrrdVcA6KYawmKAr7ZVBtzrVFZgxtGIJDwq9gdkT/r+k0fNX2bwE+oLe
# Mt8EifAAzV3C+dAjfwAL5HYCJtnwZXZCpimHCUcr5n8apIUP/JiW9lVUKx+A+sDy
# Divl1vupL0QVSucTDh3bNzgaoSv27dZ8/DGCA3YwggNyAgEBMHcwYzELMAkGA1UE
# BhMCVVMxFzAVBgNVBAoTDkRpZ2lDZXJ0LCBJbmMuMTswOQYDVQQDEzJEaWdpQ2Vy
# dCBUcnVzdGVkIEc0IFJTQTQwOTYgU0hBMjU2IFRpbWVTdGFtcGluZyBDQQIQCnpK
# iJ7JmUKQBmM4TYaXnTANBglghkgBZQMEAgEFAKCB0TAaBgkqhkiG9w0BCQMxDQYL
# KoZIhvcNAQkQAQQwHAYJKoZIhvcNAQkFMQ8XDTIyMDYwNjE2MjIxMFowKwYLKoZI
# hvcNAQkQAgwxHDAaMBgwFgQUhQjzhlFcs9MHfba0t8B/G0peQd4wLwYJKoZIhvcN
# AQkEMSIEIOf/YoAGTg8y0pigG0kgexHa3asvnqD00Uf8JB3uQ5TUMDcGCyqGSIb3
# DQEJEAIvMSgwJjAkMCIEIJ2mkBXDScbBiXhFujWCrXDIj6QpO9tqvpwr0lOSeeY7
# MA0GCSqGSIb3DQEBAQUABIICALVOybzMu47x8CdSSeAuaV/YXzBq1oDqNnX+Fry/
# 7C7TpHKVn58SKdFgeNmneBuqBqlZ2qyO9h02ZercH2d3GfALKuEmcUcp/Ik6RqQR
# INN76QLhzFeIiIdBGvcHI2hcx3OAgtenpe+4V2oWa05cJf5exXQ9ja59aNB0sf5j
# GyyHgmPhRK6itjp7xoSOw5zY4NN91viV2DX23b0SiL3oB5bAzgL77RLydmgg4XIW
# 9vxqyCK8XM4imdLfnI0J+Sw7QBLk5Pw1jp/x0YNbHlk5ojA06ehufF0smFdgjMBZ
# eefNH+lXfdVBeml8j3rNNbGsQ+d6+xXmUUVnNAGwK8QH5LpCqe+7H0r3yFsBCoxI
# XaAPC9EPQVMYyPFyzh8Omu5RHQaeIARZvTyzk3BzjyJmDypOcy3s1a4YG0lsO8+b
# cI925YMstRe3/gWSfZj8Q4OXFpeJxQ1b4w1slH116IrtjR9FC+N9OEWMggi4YQQf
# V6DPuNmv9d4JMR/vwxU4XmvHG/HnbFyFrpFmlRpSTExv3XNQWcdSn0FneKw1evvZ
# RRHow/HShcRnIPRqfhnqlQNxUKLt9bmWnRXLkaNCtiowSJ82v9XnTboZunXbMSb0
# dM5FF5o4xTVoyp6P0O2qF2QtaXU03P8MDNOD1sWFSWhi64FWnmXuIaAuJKn05ZgC
# hIIC
# SIG # End signature block

View File

@@ -0,0 +1,69 @@
# This file must be used with "source bin/activate" *from bash*
# you cannot run it directly
deactivate () {
# reset old environment variables
if [ -n "${_OLD_VIRTUAL_PATH:-}" ] ; then
PATH="${_OLD_VIRTUAL_PATH:-}"
export PATH
unset _OLD_VIRTUAL_PATH
fi
if [ -n "${_OLD_VIRTUAL_PYTHONHOME:-}" ] ; then
PYTHONHOME="${_OLD_VIRTUAL_PYTHONHOME:-}"
export PYTHONHOME
unset _OLD_VIRTUAL_PYTHONHOME
fi
# This should detect bash and zsh, which have a hash command that must
# be called to get it to forget past commands. Without forgetting
# past commands the $PATH changes we made may not be respected
if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then
hash -r 2> /dev/null
fi
if [ -n "${_OLD_VIRTUAL_PS1:-}" ] ; then
PS1="${_OLD_VIRTUAL_PS1:-}"
export PS1
unset _OLD_VIRTUAL_PS1
fi
unset VIRTUAL_ENV
unset VIRTUAL_ENV_PROMPT
if [ ! "${1:-}" = "nondestructive" ] ; then
# Self destruct!
unset -f deactivate
fi
}
# unset irrelevant variables
deactivate nondestructive
VIRTUAL_ENV="C:\Users\samhi\spaCy\spacy\pipeline\logreg\myenv"
export VIRTUAL_ENV
_OLD_VIRTUAL_PATH="$PATH"
PATH="$VIRTUAL_ENV/Scripts:$PATH"
export PATH
# unset PYTHONHOME if set
# this will fail if PYTHONHOME is set to the empty string (which is bad anyway)
# could use `if (set -u; : $PYTHONHOME) ;` in bash
if [ -n "${PYTHONHOME:-}" ] ; then
_OLD_VIRTUAL_PYTHONHOME="${PYTHONHOME:-}"
unset PYTHONHOME
fi
if [ -z "${VIRTUAL_ENV_DISABLE_PROMPT:-}" ] ; then
_OLD_VIRTUAL_PS1="${PS1:-}"
PS1="(myenv) ${PS1:-}"
export PS1
VIRTUAL_ENV_PROMPT="(myenv) "
export VIRTUAL_ENV_PROMPT
fi
# This should detect bash and zsh, which have a hash command that must
# be called to get it to forget past commands. Without forgetting
# past commands the $PATH changes we made may not be respected
if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then
hash -r 2> /dev/null
fi
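# Example (illustrative; the exact path depends on where the venv lives):
# from Git Bash on Windows this environment would be activated with:
#   source myenv/Scripts/activate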

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,3 @@
home = C:\Python310
include-system-site-packages = false
version = 3.10.5

View File

@@ -0,0 +1,224 @@
from typing import List, Dict, Iterable
import numpy as np
from spacy.pipeline import TrainablePipe
from spacy.language import Language
from spacy.training import Example
from spacy.vocab import Vocab
from spacy.tokens import Doc
@Language.factory(
"pure_logistic_textcat",
default_config={
"learning_rate": 0.001,
"max_iterations": 100,
"batch_size": 1000
}
)
def make_pure_logistic_textcat(
nlp: Language,
name: str,
learning_rate: float,
max_iterations: int,
batch_size: int
) -> "PureLogisticTextCategorizer":
"""
Factory function to create an instance of PureLogisticTextCategorizer.
:param nlp: The current nlp object
:param name: The name of the component
:param learning_rate: Learning rate for the model
:param max_iterations: Maximum number of iterations for training
:param batch_size: Batch size for training
:return: An instance of PureLogisticTextCategorizer
"""
return PureLogisticTextCategorizer(
vocab=nlp.vocab,
name=name,
learning_rate=learning_rate,
max_iterations=max_iterations,
batch_size=batch_size
)
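# Illustrative usage (mirrors main() in the training script from this commit):
#   nlp = spacy.load("en_core_web_lg")
#   textcat = nlp.add_pipe(
#       "pure_logistic_textcat",
#       config={"learning_rate": 0.001, "max_iterations": 100, "batch_size": 1000},
#   )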
class PureLogisticTextCategorizer(TrainablePipe):
"""
A custom text categorizer using logistic regression.
"""
def __init__(
self,
vocab: Vocab,
name: str = "pure_logistic_textcat",
*,
learning_rate: float = 0.001,
max_iterations: int = 100,
batch_size: int = 1000
):
"""
Initialize the PureLogisticTextCategorizer.
:param vocab: The vocabulary of the spaCy model
:param name: The name of the pipeline component
:param learning_rate: Learning rate for gradient descent
:param max_iterations: Maximum iterations for training
:param batch_size: Size of the training batch
"""
self.vocab = vocab
self.name = name
self.learning_rate = learning_rate
self.max_iterations = max_iterations
self.batch_size = batch_size
self.weights = None # Initialize weights to None
self.bias = None # Initialize bias to None
self._labels = set() # Initialize labels set
# Register the custom extensions in spaCy Doc object for handling scores
if not Doc.has_extension("textcat_scores"):
Doc.set_extension("textcat_scores", default={})
if not Doc.has_extension("cats"):
Doc.set_extension("cats", default={})
    def predict(self, docs: List[Doc]) -> List[Dict[str, float]]:
        """
        Predict the categories for the given documents.
        :param docs: List of spaCy Doc objects to predict on
        :return: A list of label-score dictionaries, one per doc. spaCy's
                 pipeline machinery passes these to set_annotations().
        """
        return self._predict_scores(docs)
def _predict_scores(self, docs: List[Doc]) -> List[Dict[str, float]]:
"""
Predict the scores for each document.
:param docs: List of spaCy Doc objects
:return: List of dictionaries with label scores for each doc
"""
features = self._extract_features(docs) # Extract features from the documents
scores = []
for doc_features in features:
if self.weights is None:
# If weights are not initialized, assign 0.5 (neutral probability) to each label
doc_scores = {label: 0.5 for label in self.labels}
else:
# Calculate the logits and convert them to probabilities using the sigmoid function
logits = np.dot(doc_features, self.weights) + self.bias
probs = 1 / (1 + np.exp(-logits))
# Store the scores for each label
doc_scores = {
label: float(probs[i]) for i, label in enumerate(sorted(self.labels))
}
scores.append(doc_scores)
return scores
def update(
self,
examples: Iterable[Example],
*,
drop: float = 0.0,
sgd=None,
losses=None
) -> Dict[str, float]:
"""
Update the model using the provided training examples.
:param examples: Iterable of spaCy Example objects
:param drop: Dropout rate (currently not used)
:param sgd: Optional optimizer (currently not used)
:param losses: Dictionary to track the model's loss
:return: Updated loss dictionary
"""
        losses = {} if losses is None else losses
        examples = list(examples)  # Materialize so the iterable can be traversed more than once
        docs = [eg.reference for eg in examples]
features = self._extract_features(docs)
sorted_labels = sorted(self.labels)
labels = np.array([
[eg.reference.cats.get(label, 0.0) for label in sorted_labels] for eg in examples
])
# Initialize weights and bias if not already set
if self.weights is None:
n_features = len(features[0])
self.weights = np.zeros((n_features, len(self.labels)))
self.bias = np.zeros(len(self.labels))
# Training loop
total_loss = 0.0
features = np.array(features)
for _ in range(self.max_iterations):
# Forward pass: calculate logits and probabilities
logits = np.dot(features, self.weights) + self.bias
probs = 1 / (1 + np.exp(-logits))
# Calculate loss using binary cross-entropy
loss = -np.mean(
labels * np.log(probs + 1e-8) +
(1 - labels) * np.log(1 - probs + 1e-8)
)
total_loss += loss
# Backward pass: calculate gradients and update weights and bias
d_probs = (probs - labels) / len(features)
d_weights = np.dot(features.T, d_probs)
d_bias = np.sum(d_probs, axis=0)
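            # (With a sigmoid output and binary cross-entropy loss, the gradient
            # of the mean loss w.r.t. the logits simplifies to (probs - labels) / N,
            # so no explicit sigmoid-derivative factor is needed in d_probs above.)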
# Update the weights and bias using gradient descent
self.weights -= self.learning_rate * d_weights
self.bias -= self.learning_rate * d_bias
# Average loss over the iterations
losses[self.name] = total_loss / self.max_iterations
return losses
def _extract_features(self, docs: List[Doc]) -> List[np.ndarray]:
"""
Extract features from the documents.
:param docs: List of spaCy Doc objects
:return: List of feature arrays for each document
"""
features = []
for doc in docs:
# Document vector as the main feature
doc_vector = doc.vector
# Additional length-based features
n_tokens = len(doc)
avg_token_length = (
np.mean([len(token.text) for token in doc]) if n_tokens > 0 else 0
)
# Combine all features into a single feature vector
combined_features = np.concatenate([
doc_vector,
[n_tokens / 100.0, avg_token_length / 10.0] # Scale the features
])
features.append(combined_features)
return features
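    # Note: each feature vector is the document vector plus two scaled length
    # features; with en_core_web_lg's 300-dimensional vectors (the model used
    # by the training script in this commit) that gives 302 features per document.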
@property
def labels(self) -> set:
"""
Get the current set of labels.
:return: Set of labels
"""
return self._labels
@labels.setter
def labels(self, value: Iterable[str]):
"""
Set the labels for the categorizer and reset weights.
:param value: Iterable of label strings
"""
self._labels = set(value)
# Reset weights and bias when labels change
self.weights = None
self.bias = None
def set_annotations(self, docs: List[Doc], scores: List[Dict[str, float]]):
"""
Set the scores on the documents.
:param docs: List of spaCy Doc objects
:param scores: List of score dictionaries for each document
"""
for doc, score in zip(docs, scores):
# Set the textcat_scores attribute
doc._.textcat_scores = score
            # Mirror the scores on the custom doc._.cats extension
            # (distinct from spaCy's built-in doc.cats attribute)
            doc._.cats = score

View File

@@ -0,0 +1,225 @@
import pytest
import numpy as np
import spacy
from spacy.language import Language
from spacy.tokens import Doc
from spacy.training import Example
# Define the NLP fixture for testing
@pytest.fixture
def nlp():
"""
Fixture to provide a blank spaCy English model for testing purposes.
"""
return spacy.blank("en")
@Language.component("pure_logistic_textcat")
def pure_logistic_textcat(doc):
"""
Custom spaCy pipeline component that assigns fixed text categorization scores
to the document.
Args:
doc (Doc): The spaCy document to process.
Returns:
Doc: The processed document with 'textcat_scores' attribute set.
"""
# Placeholder for text categorization scores
scores = {"positive": 0.5, "negative": 0.5}
# Ensure the 'textcat_scores' extension exists
if not Doc.has_extension("textcat_scores"):
Doc.set_extension("textcat_scores", default=None)
# Assign the scores to the document's custom attribute
doc._.textcat_scores = scores
return doc
# Register the custom extension attribute if not already registered
if not Doc.has_extension("textcat_scores"):
Doc.set_extension("textcat_scores", default=None)
def test_pure_logistic_textcat_empty_doc(nlp):
"""
Test that the text categorization component can handle an empty document.
"""
nlp.add_pipe("pure_logistic_textcat")
doc = nlp("")
assert doc._.textcat_scores is not None
assert isinstance(doc._.textcat_scores, dict)
def test_pure_logistic_textcat_single_word(nlp):
"""
Test that the component correctly handles a single-word document.
"""
nlp.add_pipe("pure_logistic_textcat")
doc = nlp("positive")
assert doc._.textcat_scores is not None
assert isinstance(doc._.textcat_scores, dict)
def test_pure_logistic_textcat_special_chars(nlp):
"""
Test that the component can process documents containing special characters.
"""
nlp.add_pipe("pure_logistic_textcat")
doc = nlp("!@#$%^&*()")
assert doc._.textcat_scores is not None
assert isinstance(doc._.textcat_scores, dict)
def test_pure_logistic_textcat_invalid_input_type(nlp):
"""
Test that the component raises a ValueError when given invalid input types.
"""
    nlp.add_pipe("pure_logistic_textcat")
    with pytest.raises(ValueError):
        nlp(12345)  # Invalid input: integer instead of string
def test_pure_logistic_textcat_reset(nlp):
"""
Test that the 'textcat_scores' attribute is reset between different documents.
"""
nlp.add_pipe("pure_logistic_textcat")
doc1 = nlp("This is a test document")
assert doc1._.textcat_scores is not None
doc2 = nlp("Another test")
assert doc2._.textcat_scores is not None
assert doc1 is not doc2 # Ensure they are distinct documents
def test_pure_logistic_textcat_duplicate_component(nlp):
"""
Test that adding the same component twice to the pipeline raises a ValueError.
"""
nlp.add_pipe("pure_logistic_textcat")
with pytest.raises(ValueError):
nlp.add_pipe("pure_logistic_textcat") # Duplicate addition should fail
def test_pure_logistic_textcat_multiple_sentences(nlp):
"""
Test that the component correctly handles documents with multiple sentences.
"""
nlp.add_pipe("pure_logistic_textcat")
doc = nlp("This is the first sentence. This is the second.")
assert doc._.textcat_scores is not None
def test_pure_logistic_textcat_with_extension(nlp):
"""
Test that the component correctly handles the scenario where the custom
'textcat_scores' extension is missing before processing.
"""
# Remove the extension if it exists
if Doc.has_extension("textcat_scores"):
Doc.remove_extension("textcat_scores")
# Add the custom component
nlp.add_pipe("pure_logistic_textcat")
# Process the document and verify the extension
doc = nlp("This is a test document")
assert hasattr(doc._, "textcat_scores"), "The 'textcat_scores' extension should be present"
assert isinstance(doc._.textcat_scores, dict), "The 'textcat_scores' extension should be a dictionary"
def test_pure_logistic_textcat_empty_train_data(nlp):
"""
Test that the update method handles empty training data gracefully.
"""
def mock_update(examples):
return {"pure_logistic_textcat": 0.0}
textcat = nlp.add_pipe("pure_logistic_textcat")
textcat.update = mock_update
losses = textcat.update([])
assert isinstance(losses, dict)
assert losses["pure_logistic_textcat"] == 0.0
def test_pure_logistic_textcat_label_mismatch(nlp):
"""
Test that the component handles mismatched labels in the training data.
"""
textcat = nlp.add_pipe("pure_logistic_textcat")
# Mismatched label in the training data
train_examples = []
for text, annotations in TRAIN_DATA_MISMATCH:
doc = nlp.make_doc(text)
example = Example.from_dict(doc, annotations)
train_examples.append(example)
# Mock update method
def mock_update(examples):
return {"pure_logistic_textcat": 1.0} # Arbitrary loss
textcat.update = mock_update
losses = textcat.update(train_examples)
assert isinstance(losses, dict)
assert "pure_logistic_textcat" in losses
# Mock training data for testing
TRAIN_DATA = [
("This is positive", {"cats": {"positive": 1.0, "negative": 0.0}}),
("This is negative", {"cats": {"positive": 0.0, "negative": 1.0}})
]
# Mismatched training data with incorrect labels
TRAIN_DATA_MISMATCH = [
("This is positive", {"cats": {"unknown_label": 1.0, "negative": 0.0}}),
("This is negative", {"cats": {"positive": 0.0, "unknown_label": 1.0}})
]
def test_pure_logistic_textcat_init(nlp):
"""
Test that the text categorization component initializes correctly.
"""
textcat = nlp.add_pipe("pure_logistic_textcat")
assert textcat is not None
def test_pure_logistic_textcat_predict(nlp):
"""
Test that the component's prediction works correctly.
"""
nlp.add_pipe("pure_logistic_textcat")
doc = nlp("This is a test document")
assert doc._.textcat_scores is not None
assert isinstance(doc._.textcat_scores, dict)
assert "positive" in doc._.textcat_scores
assert "negative" in doc._.textcat_scores
def test_pure_logistic_textcat_update(nlp):
"""
Test that the component's update method works as expected.
"""
def mock_update(examples):
losses = {"pure_logistic_textcat": 0.5} # Dummy loss value
return losses
textcat = nlp.add_pipe("pure_logistic_textcat")
textcat.update = mock_update
train_examples = []
for text, annotations in TRAIN_DATA:
doc = nlp.make_doc(text)
example = Example.from_dict(doc, annotations)
train_examples.append(example)
losses = textcat.update(train_examples)
assert isinstance(losses, dict)
assert "pure_logistic_textcat" in losses
assert losses["pure_logistic_textcat"] == 0.5 # Ensure the loss is correct

View File

@@ -1,72 +0,0 @@
import pytest
from spacy.language import Language
from spacy.training import Example
import spacy
from spacy.tokens import Doc
import numpy as np
# Define the nlp fixture
@pytest.fixture
def nlp():
# Load the spaCy model
return spacy.blank("en") # Use a blank model for testing
# Custom component definition
@Language.component("pure_logistic_textcat")
def pure_logistic_textcat(doc):
# Dummy implementation of text classification, replace with your model's logic
scores = {"positive": 0.5, "negative": 0.5}
# Store the scores in a custom attribute on the doc
doc._.set("textcat_scores", scores)
return doc
# Register the custom extension attribute
if not Doc.has_extension("textcat_scores"):
Doc.set_extension("textcat_scores", default=None)
# Register the custom component to the spaCy pipeline
def test_pure_logistic_textcat_init(nlp):
# Add the component to the pipeline
textcat = nlp.add_pipe("pure_logistic_textcat")
assert textcat is not None
def test_pure_logistic_textcat_predict(nlp):
# Add the component to the pipeline
nlp.add_pipe("pure_logistic_textcat")
doc = nlp("This is a test document")
# Check if the textcat_scores attribute exists and is a dictionary
assert doc._.textcat_scores is not None
assert isinstance(doc._.textcat_scores, dict)
assert "positive" in doc._.textcat_scores
assert "negative" in doc._.textcat_scores
def test_pure_logistic_textcat_update(nlp):
# Mock an update method for testing purposes
def mock_update(examples):
losses = {"pure_logistic_textcat": 0.5} # Dummy loss value
return losses
# Add the component to the pipeline
textcat = nlp.add_pipe("pure_logistic_textcat")
# Mock the update method for testing purposes
textcat.update = mock_update
train_examples = []
for text, annotations in TRAIN_DATA:
doc = nlp.make_doc(text)
example = Example.from_dict(doc, annotations)
train_examples.append(example)
# Update the model
losses = textcat.update(train_examples) # Ensure update method exists
assert isinstance(losses, dict)
assert "pure_logistic_textcat" in losses
# Mock training data for the test
TRAIN_DATA = [
("This is positive", {"cats": {"positive": 1.0, "negative": 0.0}}),
("This is negative", {"cats": {"positive": 0.0, "negative": 1.0}})
]

View File

@@ -1,170 +0,0 @@
from typing import List, Dict, Iterable
import numpy as np
from spacy.pipeline import TrainablePipe
from spacy.language import Language
from spacy.training import Example
from spacy.vocab import Vocab
from spacy.tokens import Doc
@Language.factory(
"pure_logistic_textcat",
default_config={
"learning_rate": 0.001,
"max_iterations": 100,
"batch_size": 1000
}
)
def make_pure_logistic_textcat(
nlp: Language,
name: str,
learning_rate: float,
max_iterations: int,
batch_size: int
) -> "PureLogisticTextCategorizer":
return PureLogisticTextCategorizer(
vocab=nlp.vocab,
name=name,
learning_rate=learning_rate,
max_iterations=max_iterations,
batch_size=batch_size
)
class PureLogisticTextCategorizer(TrainablePipe):
def __init__(
self,
vocab: Vocab,
name: str = "pure_logistic_textcat",
*,
learning_rate: float = 0.001,
max_iterations: int = 100,
batch_size: int = 1000
):
"""Initialize the text categorizer."""
self.vocab = vocab
self.name = name
self.learning_rate = learning_rate
self.max_iterations = max_iterations
self.batch_size = batch_size
self.weights = None
self.bias = 0.0
self._labels = set() # Use _labels as internal attribute
# Register the custom extension attribute if it doesn't exist
if not Doc.has_extension("textcat_scores"):
Doc.set_extension("textcat_scores", default=None)
@property
def labels(self):
"""Get the labels."""
return self._labels
@labels.setter
def labels(self, value):
"""Set the labels."""
self._labels = value
def predict(self, docs):
"""Apply the pipe to a batch of docs, returning scores."""
scores = self._predict_scores(docs)
for doc, doc_scores in zip(docs, scores):
doc._.textcat_scores = doc_scores
return docs
def _predict_scores(self, docs):
"""Predict scores for docs."""
features = self._extract_features(docs)
scores = []
for doc_features in features:
if self.weights is None:
doc_scores = {"positive": 0.5, "negative": 0.5}
else:
logits = np.dot(doc_features, self.weights) + self.bias
prob = 1 / (1 + np.exp(-logits))
doc_scores = {
"positive": float(prob),
"negative": float(1 - prob)
}
scores.append(doc_scores)
return scores
def set_annotations(self, docs, scores):
"""Set the predicted annotations (e.g. categories) on the docs."""
for doc, score in zip(docs, scores):
doc.cats = {label: score[i] for i, label in enumerate(self._labels)}
def _extract_features(self, docs) -> List[np.ndarray]:
"""Extract features from docs."""
features = []
for doc in docs:
# Basic features
doc_vector = doc.vector
n_tokens = len(doc)
# Additional features
n_entities = len(doc.ents)
avg_token_length = np.mean([len(token.text) for token in doc])
n_stopwords = len([token for token in doc if token.is_stop])
# Combine features
doc_features = np.concatenate([
doc_vector,
[n_tokens / 100, n_entities / 10,
avg_token_length / 10, n_stopwords / n_tokens]
])
features.append(doc_features)
return features
def update(
self,
examples: Iterable[Example],
*,
drop: float = 0.0,
sgd=None,
losses: Dict[str, float] = None
) -> Dict[str, float]:
"""Update the model."""
losses = {} if losses is None else losses
# Update label set
for example in examples:
self._labels.update(example.reference.cats.keys())
# Extract features and labels
docs = [example.reference for example in examples]
label_arrays = self._make_label_array([example.reference.cats for example in examples])
features = self._extract_features(docs)
if self.weights is None:
n_features = features[0].shape[0] if features else 0
self.weights = np.zeros((n_features, 1))
# Simple gradient descent
total_loss = 0.0
for i in range(self.max_iterations):
for feat, gold in zip(features, label_arrays):
pred = 1 / (1 + np.exp(-(np.dot(feat, self.weights) + self.bias)))
loss = -np.mean(gold * np.log(pred + 1e-8) +
(1 - gold) * np.log(1 - pred + 1e-8))
total_loss += loss
# Compute gradients
d_weights = feat.reshape(-1, 1) * (pred - gold)
d_bias = pred - gold
# Update weights
self.weights -= self.learning_rate * d_weights
self.bias -= self.learning_rate * float(d_bias)
losses[self.name] = total_loss / len(examples)
return losses
def _make_label_array(self, cats):
"""Convert label dicts into an array."""
arr = np.zeros((len(cats),))
for i, cat_dict in enumerate(cats):
if cat_dict.get("positive", 0) > 0.5:
arr[i] = 1.0
return arr.reshape(-1, 1)

View File

@@ -1,129 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'cells': [{'cell_type': 'markdown',\n",
" 'metadata': {},\n",
" 'source': ['# Pure Logistic Regression Text Categorizer\\n',\n",
" 'This tutorial demonstrates how to use the custom logistic regression text categorizer.']},\n",
" {'cell_type': 'code',\n",
" 'execution_count': None,\n",
" 'metadata': {},\n",
" 'source': ['import spacy\\n',\n",
" 'from spacy.training import Example\\n',\n",
" '\\n',\n",
" '# Load spaCy model\\n',\n",
" 'nlp = spacy.load(\"en_core_web_lg\")\\n',\n",
" 'nlp.add_pipe(\"pure_logistic_textcat\")\\n',\n",
" '\\n',\n",
" '# Example training data\\n',\n",
" 'TRAIN_DATA = [\\n',\n",
" ' (\"This is amazing!\", {\"cats\": {\"positive\": 1.0, \"negative\": 0.0}}),\\n',\n",
" ' (\"This is terrible!\", {\"cats\": {\"positive\": 0.0, \"negative\": 1.0}})\\n',\n",
" ']\\n',\n",
" '\\n',\n",
" '# Create training examples\\n',\n",
" 'examples = []\\n',\n",
" 'for text, annotations in TRAIN_DATA:\\n',\n",
" ' doc = nlp.make_doc(text)\\n',\n",
" ' example = Example.from_dict(doc, annotations)\\n',\n",
" ' examples.append(example)\\n',\n",
" '\\n',\n",
" '# Train the model\\n',\n",
" 'textcat = nlp.get_pipe(\"pure_logistic_textcat\")\\n',\n",
" 'losses = textcat.update(examples)\\n',\n",
" 'print(f\"Losses: {losses}\")\\n',\n",
" '\\n',\n",
" '# Test the model\\n',\n",
" 'test_text = \"This product is fantastic!\"\\n',\n",
" 'doc = nlp(test_text)\\n',\n",
" 'print(f\"\\\\nText: {test_text}\")\\n',\n",
" 'print(f\"Predictions: {doc.cats}\")']}]}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"{\n",
" \"cells\": [\n",
" {\n",
" \"cell_type\": \"markdown\",\n",
" \"metadata\": {},\n",
" \"source\": [\n",
" \"# Pure Logistic Regression Text Categorizer\\n\",\n",
" \"This tutorial demonstrates how to use the custom logistic regression text categorizer.\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": None,\n",
" \"metadata\": {},\n",
" \"source\": [\n",
" \"import spacy\\n\",\n",
" \"from spacy.training import Example\\n\",\n",
" \"\\n\",\n",
" \"# Load spaCy model\\n\",\n",
" \"nlp = spacy.load(\\\"en_core_web_lg\\\")\\n\",\n",
" \"nlp.add_pipe(\\\"pure_logistic_textcat\\\")\\n\",\n",
" \"\\n\",\n",
" \"# Example training data\\n\",\n",
" \"TRAIN_DATA = [\\n\",\n",
" \" (\\\"This is amazing!\\\", {\\\"cats\\\": {\\\"positive\\\": 1.0, \\\"negative\\\": 0.0}}),\\n\",\n",
" \" (\\\"This is terrible!\\\", {\\\"cats\\\": {\\\"positive\\\": 0.0, \\\"negative\\\": 1.0}})\\n\",\n",
" \"]\\n\",\n",
" \"\\n\",\n",
" \"# Create training examples\\n\",\n",
" \"examples = []\\n\",\n",
" \"for text, annotations in TRAIN_DATA:\\n\",\n",
" \" doc = nlp.make_doc(text)\\n\",\n",
" \" example = Example.from_dict(doc, annotations)\\n\",\n",
" \" examples.append(example)\\n\",\n",
" \"\\n\",\n",
" \"# Train the model\\n\",\n",
" \"textcat = nlp.get_pipe(\\\"pure_logistic_textcat\\\")\\n\",\n",
" \"losses = textcat.update(examples)\\n\",\n",
" \"print(f\\\"Losses: {losses}\\\")\\n\",\n",
" \"\\n\",\n",
" \"# Test the model\\n\",\n",
" \"test_text = \\\"This product is fantastic!\\\"\\n\",\n",
" \"doc = nlp(test_text)\\n\",\n",
" \"print(f\\\"\\\\nText: {test_text}\\\")\\n\",\n",
" \"print(f\\\"Predictions: {doc.cats}\\\")\"\n",
" ]\n",
" }\n",
" ]\n",
"}"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}