1 /* 2 * Copyright 2006-2010, Haiku, Inc. All Rights Reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Andrew Galante, haiku.galante@gmail.com 7 * Axel Dörfler, axeld@pinc-software.de 8 * Hugo Santos, hugosantos@gmail.com 9 */ 10 11 12 #include "TCPEndpoint.h" 13 14 #include <netinet/in.h> 15 #include <netinet/ip.h> 16 #include <netinet/tcp.h> 17 #include <new> 18 #include <signal.h> 19 #include <stdlib.h> 20 #include <string.h> 21 22 #include <KernelExport.h> 23 #include <Select.h> 24 25 #include <net_buffer.h> 26 #include <net_datalink.h> 27 #include <net_stat.h> 28 #include <NetBufferUtilities.h> 29 #include <NetUtilities.h> 30 31 #include <lock.h> 32 #include <tracing.h> 33 #include <util/AutoLock.h> 34 #include <util/khash.h> 35 #include <util/list.h> 36 37 #include "EndpointManager.h" 38 39 40 // References: 41 // - RFC 793 - Transmission Control Protocol 42 // - RFC 813 - Window and Acknowledgement Strategy in TCP 43 // - RFC 1337 - TIME_WAIT Assassination Hazards in TCP 44 // 45 // Things this implementation currently doesn't implement: 46 // - TCP Slow Start, Congestion Avoidance, Fast Retransmit, and Fast Recovery, 47 // RFC 2001, RFC 2581, RFC 3042 48 // - NewReno Modification to TCP's Fast Recovery, RFC 2582 49 // - Explicit Congestion Notification (ECN), RFC 3168 50 // - SYN-Cache 51 // - TCP Extensions for High Performance, RFC 1323 52 // - SACK, Selective Acknowledgment - RFC 2018, RFC 2883, RFC 3517 53 // - Forward RTO-Recovery, RFC 4138 54 // - Time-Wait hash instead of keeping sockets alive 55 56 #define PrintAddress(address) \ 57 AddressString(Domain(), address, true).Data() 58 59 //#define TRACE_TCP 60 //#define PROBE_TCP 61 62 #ifdef TRACE_TCP 63 // the space before ', ##args' is important in order for this to work with cpp 2.95 64 # define TRACE(format, args...) 
dprintf("%ld: TCP [%llu] %p (%12s) " \ 65 format "\n", find_thread(NULL), system_time(), this, \ 66 name_for_state(fState) , ##args) 67 #else 68 # define TRACE(args...) do { } while (0) 69 #endif 70 71 #ifdef PROBE_TCP 72 # define PROBE(buffer, window) \ 73 dprintf("TCP PROBE %llu %s %s %ld snxt %lu suna %lu cw %lu sst %lu win %lu swin %lu smax-suna %lu savail %lu sqused %lu rto %llu\n", \ 74 system_time(), PrintAddress(buffer->source), \ 75 PrintAddress(buffer->destination), buffer->size, fSendNext.Number(), \ 76 fSendUnacknowledged.Number(), fCongestionWindow, fSlowStartThreshold, \ 77 window, fSendWindow, (fSendMax - fSendUnacknowledged).Number(), \ 78 fSendQueue.Available(fSendNext), fSendQueue.Used(), fRetransmitTimeout) 79 #else 80 # define PROBE(buffer, window) do { } while (0) 81 #endif 82 83 #if TCP_TRACING 84 namespace TCPTracing { 85 86 class Receive : public AbstractTraceEntry { 87 public: 88 Receive(TCPEndpoint* endpoint, tcp_segment_header& segment, uint32 window, 89 net_buffer* buffer) 90 : 91 fEndpoint(endpoint), 92 fBuffer(buffer), 93 fBufferSize(buffer->size), 94 fSequence(segment.sequence), 95 fAcknowledge(segment.acknowledge), 96 fWindow(window), 97 fState(endpoint->State()), 98 fFlags(segment.flags) 99 { 100 Initialized(); 101 } 102 103 virtual void AddDump(TraceOutput& out) 104 { 105 out.Print("tcp:%p (%12s) receive buffer %p (%lu bytes), flags %x, " 106 "seq %lu, ack %lu, wnd %lu", fEndpoint, name_for_state(fState), 107 fBuffer, fBufferSize, fFlags, fSequence, fAcknowledge, fWindow); 108 } 109 110 protected: 111 TCPEndpoint* fEndpoint; 112 net_buffer* fBuffer; 113 uint32 fBufferSize; 114 uint32 fSequence; 115 uint32 fAcknowledge; 116 uint32 fWindow; 117 tcp_state fState; 118 uint8 fFlags; 119 }; 120 121 class Send : public AbstractTraceEntry { 122 public: 123 Send(TCPEndpoint* endpoint, tcp_segment_header& segment, net_buffer* buffer, 124 tcp_sequence firstSequence, tcp_sequence lastSequence) 125 : 126 fEndpoint(endpoint), 127 
fBuffer(buffer), 128 fBufferSize(buffer->size), 129 fSequence(segment.sequence), 130 fAcknowledge(segment.acknowledge), 131 fFirstSequence(firstSequence.Number()), 132 fLastSequence(lastSequence.Number()), 133 fState(endpoint->State()), 134 fFlags(segment.flags) 135 { 136 Initialized(); 137 } 138 139 virtual void AddDump(TraceOutput& out) 140 { 141 out.Print("tcp:%p (%12s) send buffer %p (%lu bytes), flags %x, " 142 "seq %lu, ack %lu, first %lu, last %lu", 143 fEndpoint, name_for_state(fState), fBuffer, fBufferSize, fFlags, 144 fSequence, fAcknowledge, fFirstSequence, fLastSequence); 145 } 146 147 protected: 148 TCPEndpoint* fEndpoint; 149 net_buffer* fBuffer; 150 uint32 fBufferSize; 151 uint32 fSequence; 152 uint32 fAcknowledge; 153 uint32 fFirstSequence; 154 uint32 fLastSequence; 155 tcp_state fState; 156 uint8 fFlags; 157 }; 158 159 class State : public AbstractTraceEntry { 160 public: 161 State(TCPEndpoint* endpoint) 162 : 163 fEndpoint(endpoint), 164 fState(endpoint->State()) 165 { 166 Initialized(); 167 } 168 169 virtual void AddDump(TraceOutput& out) 170 { 171 out.Print("tcp:%p (%12s) state change", fEndpoint, 172 name_for_state(fState)); 173 } 174 175 protected: 176 TCPEndpoint* fEndpoint; 177 tcp_state fState; 178 }; 179 180 class Spawn : public AbstractTraceEntry { 181 public: 182 Spawn(TCPEndpoint* listeningEndpoint, TCPEndpoint* spawnedEndpoint) 183 : 184 fListeningEndpoint(listeningEndpoint), 185 fSpawnedEndpoint(spawnedEndpoint) 186 { 187 Initialized(); 188 } 189 190 virtual void AddDump(TraceOutput& out) 191 { 192 out.Print("tcp:%p spawns %p", fListeningEndpoint, fSpawnedEndpoint); 193 } 194 195 protected: 196 TCPEndpoint* fListeningEndpoint; 197 TCPEndpoint* fSpawnedEndpoint; 198 }; 199 200 class Error : public AbstractTraceEntry { 201 public: 202 Error(TCPEndpoint* endpoint, const char* error, int32 line) 203 : 204 fEndpoint(endpoint), 205 fLine(line), 206 fError(error), 207 fState(endpoint->State()) 208 { 209 Initialized(); 210 } 211 212 virtual 
void AddDump(TraceOutput& out) 213 { 214 out.Print("tcp:%p (%12s) error at line %ld: %s", fEndpoint, 215 name_for_state(fState), fLine, fError); 216 } 217 218 protected: 219 TCPEndpoint* fEndpoint; 220 int32 fLine; 221 const char* fError; 222 tcp_state fState; 223 }; 224 225 class Timer : public AbstractTraceEntry { 226 public: 227 Timer(TCPEndpoint* endpoint, const char* which) 228 : 229 fEndpoint(endpoint), 230 fWhich(which), 231 fState(endpoint->State()) 232 { 233 Initialized(); 234 } 235 236 virtual void AddDump(TraceOutput& out) 237 { 238 out.Print("tcp:%p (%12s) %s timer", fEndpoint, 239 name_for_state(fState), fWhich); 240 } 241 242 protected: 243 TCPEndpoint* fEndpoint; 244 const char* fWhich; 245 tcp_state fState; 246 }; 247 248 } // namespace TCPTracing 249 250 # define T(x) new(std::nothrow) TCPTracing::x 251 #else 252 # define T(x) 253 #endif // TCP_TRACING 254 255 // Initial estimate for packet round trip time (RTT) 256 #define TCP_INITIAL_RTT 2000000 257 258 // constants for the fFlags field 259 enum { 260 FLAG_OPTION_WINDOW_SCALE = 0x01, 261 FLAG_OPTION_TIMESTAMP = 0x02, 262 // TODO: Should FLAG_NO_RECEIVE apply as well to received connections? 263 // That is, what is expected from accept() after a shutdown() 264 // is performed on a listen()ing socket. 
265 FLAG_NO_RECEIVE = 0x04, 266 FLAG_CLOSED = 0x08, 267 FLAG_DELETE_ON_CLOSE = 0x10, 268 FLAG_LOCAL = 0x20 269 }; 270 271 272 static const int kTimestampFactor = 1024; 273 274 275 static inline bigtime_t 276 absolute_timeout(bigtime_t timeout) 277 { 278 if (timeout == 0 || timeout == B_INFINITE_TIMEOUT) 279 return timeout; 280 281 return timeout + system_time(); 282 } 283 284 285 static inline status_t 286 posix_error(status_t error) 287 { 288 if (error == B_TIMED_OUT) 289 return B_WOULD_BLOCK; 290 291 return error; 292 } 293 294 295 static inline bool 296 in_window(const tcp_sequence& sequence, const tcp_sequence& receiveNext, 297 uint32 receiveWindow) 298 { 299 return sequence >= receiveNext && sequence < (receiveNext + receiveWindow); 300 } 301 302 303 static inline bool 304 segment_in_sequence(const tcp_segment_header& segment, int size, 305 const tcp_sequence& receiveNext, uint32 receiveWindow) 306 { 307 tcp_sequence sequence(segment.sequence); 308 if (size == 0) { 309 if (receiveWindow == 0) 310 return sequence == receiveNext; 311 return in_window(sequence, receiveNext, receiveWindow); 312 } else { 313 if (receiveWindow == 0) 314 return false; 315 return in_window(sequence, receiveNext, receiveWindow) 316 || in_window(sequence + size - 1, receiveNext, receiveWindow); 317 } 318 } 319 320 321 static inline bool 322 is_writable(tcp_state state) 323 { 324 return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED 325 || state == ESTABLISHED || state == FINISH_RECEIVED; 326 } 327 328 329 static inline uint32 tcp_now() 330 { 331 return system_time() / kTimestampFactor; 332 } 333 334 335 static inline uint32 tcp_diff_timestamp(uint32 base) 336 { 337 uint32 now = tcp_now(); 338 339 if (now > base) 340 return now - base; 341 342 return now + UINT_MAX - base; 343 } 344 345 346 static inline bool 347 state_needs_finish(int32 state) 348 { 349 return state == WAIT_FOR_FINISH_ACKNOWLEDGE 350 || state == FINISH_SENT || state == CLOSING; 351 } 352 353 354 // #pragma 
mark - 355 356 357 WaitList::WaitList(const char* name) 358 { 359 fCondition = 0; 360 fSem = create_sem(0, name); 361 } 362 363 364 WaitList::~WaitList() 365 { 366 delete_sem(fSem); 367 } 368 369 370 status_t 371 WaitList::InitCheck() const 372 { 373 return fSem; 374 } 375 376 377 status_t 378 WaitList::Wait(MutexLocker& locker, bigtime_t timeout) 379 { 380 locker.Unlock(); 381 382 status_t status = B_OK; 383 384 while (!atomic_test_and_set(&fCondition, 0, 1)) { 385 status = acquire_sem_etc(fSem, 1, B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT, 386 timeout); 387 if (status != B_OK) 388 break; 389 } 390 391 locker.Lock(); 392 return status; 393 } 394 395 396 void 397 WaitList::Signal() 398 { 399 atomic_or(&fCondition, 1); 400 #ifdef __HAIKU__ 401 release_sem_etc(fSem, 1, B_DO_NOT_RESCHEDULE | B_RELEASE_ALL); 402 #else 403 int32 count; 404 if (get_sem_count(fSem, &count) == B_OK && count < 0) 405 release_sem_etc(fSem, -count, B_DO_NOT_RESCHEDULE); 406 #endif 407 } 408 409 410 // #pragma mark - 411 412 413 TCPEndpoint::TCPEndpoint(net_socket* socket) 414 : 415 ProtocolSocket(socket), 416 fManager(NULL), 417 fReceiveList("tcp receive"), 418 fSendList("tcp send"), 419 fOptions(0), 420 fSendWindowShift(0), 421 fReceiveWindowShift(0), 422 fSendUnacknowledged(0), 423 fSendNext(0), 424 fSendMax(0), 425 fSendUrgentOffset(0), 426 fSendWindow(0), 427 fSendMaxWindow(0), 428 fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE), 429 fSendQueue(socket->send.buffer_size), 430 fInitialSendSequence(0), 431 fDuplicateAcknowledgeCount(0), 432 fRoute(NULL), 433 fReceiveNext(0), 434 fReceiveMaxAdvertised(0), 435 fReceiveWindow(socket->receive.buffer_size), 436 fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE), 437 fReceiveQueue(socket->receive.buffer_size), 438 fRoundTripTime(TCP_INITIAL_RTT / kTimestampFactor), 439 fRoundTripDeviation(TCP_INITIAL_RTT / kTimestampFactor), 440 fRetransmitTimeout(TCP_INITIAL_RTT), 441 fReceivedTimestamp(0), 442 fCongestionWindow(0), 443 fSlowStartThreshold(0), 444 
fState(CLOSED), 445 fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP) 446 { 447 // TODO: to be replaced with a real read/write locking strategy! 448 mutex_init(&fLock, "tcp lock"); 449 450 gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this); 451 gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer, 452 this); 453 gStackModule->init_timer(&fDelayedAcknowledgeTimer, 454 TCPEndpoint::_DelayedAcknowledgeTimer, this); 455 gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer, 456 this); 457 } 458 459 460 TCPEndpoint::~TCPEndpoint() 461 { 462 mutex_lock(&fLock); 463 464 _CancelConnectionTimers(); 465 gStackModule->cancel_timer(&fTimeWaitTimer); 466 467 if (fManager != NULL) { 468 fManager->Unbind(this); 469 put_endpoint_manager(fManager); 470 } 471 472 mutex_destroy(&fLock); 473 474 // we need to wait for all timers to return 475 gStackModule->wait_for_timer(&fRetransmitTimer); 476 gStackModule->wait_for_timer(&fPersistTimer); 477 gStackModule->wait_for_timer(&fDelayedAcknowledgeTimer); 478 gStackModule->wait_for_timer(&fTimeWaitTimer); 479 480 gDatalinkModule->put_route(Domain(), fRoute); 481 } 482 483 484 status_t 485 TCPEndpoint::InitCheck() const 486 { 487 if (fReceiveList.InitCheck() < B_OK) 488 return fReceiveList.InitCheck(); 489 490 if (fSendList.InitCheck() < B_OK) 491 return fSendList.InitCheck(); 492 493 return B_OK; 494 } 495 496 497 // #pragma mark - protocol API 498 499 500 status_t 501 TCPEndpoint::Open() 502 { 503 TRACE("Open()"); 504 505 status_t status = ProtocolSocket::Open(); 506 if (status < B_OK) 507 return status; 508 509 fManager = get_endpoint_manager(Domain()); 510 if (fManager == NULL) 511 return EAFNOSUPPORT; 512 513 return B_OK; 514 } 515 516 517 status_t 518 TCPEndpoint::Close() 519 { 520 TRACE("Close()"); 521 522 MutexLocker locker(fLock); 523 524 if (fState == LISTEN) 525 delete_sem(fAcceptSemaphore); 526 527 if (fState == SYNCHRONIZE_SENT || fState == LISTEN) { 528 // 
TODO: what about linger in case of SYNCHRONIZE_SENT? 529 fState = CLOSED; 530 T(State(this)); 531 return B_OK; 532 } 533 534 status_t status = _Disconnect(true); 535 if (status != B_OK) 536 return status; 537 538 if (socket->options & SO_LINGER) { 539 TRACE("Close(): Lingering for %i secs", socket->linger); 540 541 bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL); 542 543 while (fSendQueue.Used() > 0) { 544 status = fSendList.Wait(locker, maximum); 545 if (status == B_TIMED_OUT || status == B_WOULD_BLOCK) 546 break; 547 else if (status < B_OK) 548 return status; 549 } 550 551 TRACE("Close(): after waiting, the SendQ was left with %lu bytes.", 552 fSendQueue.Used()); 553 } 554 return B_OK; 555 } 556 557 558 void 559 TCPEndpoint::Free() 560 { 561 TRACE("Free()"); 562 563 MutexLocker _(fLock); 564 565 if (fState <= SYNCHRONIZE_SENT) 566 return; 567 568 // we are only interested in the timer, not in changing state 569 _EnterTimeWait(); 570 571 fFlags |= FLAG_CLOSED; 572 if ((fFlags & FLAG_DELETE_ON_CLOSE) == 0) { 573 // we'll be freed later when the 2MSL timer expires 574 gSocketModule->acquire_socket(socket); 575 } 576 } 577 578 579 /*! Creates and sends a synchronize packet to /a address, and then waits 580 until the connection has been established or refused. 
	A restarted syscall only resumes waiting with the stored timeout.
	\return EAFNOSUPPORT if \a address has the wrong family, EISCONN if the
		endpoint is already established, EINPROGRESS if a connection attempt
		is under way or the send timeout is 0 (non-blocking), or the result
		of waiting for the handshake (with B_TIMED_OUT mapped to
		B_WOULD_BLOCK).
*/
status_t
TCPEndpoint::Connect(const sockaddr* address)
{
	TRACE("Connect() on address %s", PrintAddress(address));

	if (!AddressModule()->is_same_family(address))
		return EAFNOSUPPORT;

	MutexLocker locker(fLock);

	if (gStackModule->is_restarted_syscall()) {
		// the SYN has already been sent by the interrupted call, just
		// continue to wait with the remaining (absolute) timeout
		bigtime_t timeout = gStackModule->restore_syscall_restart_timeout();
		status_t status = _WaitForEstablished(locker, timeout);
		TRACE(" Connect(): Connection complete: %s (timeout was %llu)",
			strerror(status), timeout);
		return posix_error(status);
	}

	// Can only call connect() from CLOSED or LISTEN states
	// otherwise endpoint is considered already connected
	if (fState == LISTEN) {
		// this socket is about to connect; remove pending connections in the backlog
		gSocketModule->set_max_backlog(socket, 0);
	} else if (fState == ESTABLISHED) {
		return EISCONN;
	} else if (fState != CLOSED)
		return EINPROGRESS;

	// consider destination address INADDR_ANY as INADDR_LOOPBACK
	sockaddr_storage _address;
	if (AddressModule()->is_empty_address(address, false)) {
		AddressModule()->get_loopback_address((sockaddr *)&_address);
		// for IPv4 and IPv6 the port is at the same offset
		((sockaddr_in &)_address).sin_port = ((sockaddr_in *)address)->sin_port;
		address = (sockaddr *)&_address;
	}

	status_t status = _PrepareSendPath(address);
	if (status < B_OK)
		return status;

	TRACE(" Connect(): starting 3-way handshake...");

	fState = SYNCHRONIZE_SENT;
	T(State(this));

	// send SYN
	status = _SendQueued();
	if (status != B_OK) {
		_Close();
		return status;
	}

	// If we are running over Loopback, after _SendQueued() returns we
	// may be in ESTABLISHED already.
	if (fState == ESTABLISHED) {
		TRACE(" Connect() completed after _SendQueued()");
		return B_OK;
	}

	// wait until 3-way handshake is complete (if needed)
	bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT);
	if (timeout == 0) {
		// we're a non-blocking socket
		TRACE(" Connect() delayed, return EINPROGRESS");
		return EINPROGRESS;
	}

	// remember the absolute timeout, so that a restarted syscall doesn't
	// start waiting from scratch
	bigtime_t absoluteTimeout = absolute_timeout(timeout);
	gStackModule->store_syscall_restart_timeout(absoluteTimeout);

	status = _WaitForEstablished(locker, absoluteTimeout);
	TRACE(" Connect(): Connection complete: %s (timeout was %llu)",
		strerror(status), timeout);
	return posix_error(status);
}


/*!	Waits on the accept semaphore for a connection and dequeues it from the
	socket into \a _acceptedSocket. The endpoint lock is not held while
	waiting. The wait honours the socket's receive timeout (restored on a
	restarted syscall).
	\return B_WOULD_BLOCK if the wait timed out and the receive timeout is 0,
		otherwise the error of acquire_sem_etc(), or B_OK on success.
*/
status_t
TCPEndpoint::Accept(struct net_socket** _acceptedSocket)
{
	TRACE("Accept()");

	MutexLocker locker(fLock);

	status_t status;
	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
	if (gStackModule->is_restarted_syscall())
		timeout = gStackModule->restore_syscall_restart_timeout();
	else
		gStackModule->store_syscall_restart_timeout(timeout);

	do {
		locker.Unlock();

		status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT
			| B_CAN_INTERRUPT, timeout);
		if (status != B_OK) {
			// NOTE: we return with the lock released here; the locker keeps
			// track of that
			if (status == B_TIMED_OUT && socket->receive.timeout == 0)
				return B_WOULD_BLOCK;

			return status;
		}

		locker.Lock();
		status = gSocketModule->dequeue_connected(socket, _acceptedSocket);
#ifdef TRACE_TCP
		if (status == B_OK)
			TRACE(" Accept() returning %p", (*_acceptedSocket)->first_protocol);
#endif
	} while (status != B_OK);
		// retry if the semaphore was released, but no socket could be
		// dequeued

	return status;
}


/*!	Binds the endpoint to \a address via the endpoint manager.
	\return B_BAD_VALUE if \a address is NULL, EISCONN if the endpoint is not
		in the CLOSED state, otherwise the manager's result.
*/
status_t
TCPEndpoint::Bind(const sockaddr *address)
{
	if (address == NULL)
		return B_BAD_VALUE;

	MutexLocker lock(fLock);

	TRACE("Bind() on address %s", PrintAddress(address));

	if (fState != CLOSED)
		return EISCONN;

	return fManager->Bind(this, address);
}


/*!	Removes the endpoint's binding from the endpoint manager; \a address is
	not used.
*/
status_t
TCPEndpoint::Unbind(struct sockaddr *address)
{
	TRACE("Unbind()");

	MutexLocker _(fLock);
	return fManager->Unbind(this);
}


/*!	Turns a closed endpoint into a listening one: creates the accept
	semaphore, registers the endpoint as passive with the manager, and sets
	the socket's backlog to \a count.
	\return B_BAD_VALUE if the endpoint is not CLOSED, ENOBUFS if the
		semaphore could not be created, or the error of SetPassive().
*/
status_t
TCPEndpoint::Listen(int count)
{
	TRACE("Listen()");

	MutexLocker _(fLock);

	if (fState != CLOSED)
		return B_BAD_VALUE;

	fAcceptSemaphore = create_sem(0, "tcp accept");
	if (fAcceptSemaphore < B_OK)
		return ENOBUFS;

	status_t status = fManager->SetPassive(this);
	if (status != B_OK) {
		// don't leave a stale semaphore behind
		delete_sem(fAcceptSemaphore);
		fAcceptSemaphore = -1;
		return status;
	}

	gSocketModule->set_max_backlog(socket, count);

	fState = LISTEN;
	T(State(this));
	return B_OK;
}


/*!	Shuts down the receiving and/or sending side, depending on \a direction.
	Shutting down reading sets FLAG_NO_RECEIVE, shutting down writing calls
	_Disconnect(). The result of _Disconnect() is ignored; B_OK is always
	returned.
*/
status_t
TCPEndpoint::Shutdown(int direction)
{
	TRACE("Shutdown(%i)", direction);

	MutexLocker lock(fLock);

	if (direction == SHUT_RD || direction == SHUT_RDWR)
		fFlags |= FLAG_NO_RECEIVE;

	if (direction == SHUT_WR || direction == SHUT_RDWR) {
		// TODO: That's not correct. After read/write shutting down the socket
		// one should still be able to read previously arrived data.
		_Disconnect(false);
	}

	return B_OK;
}


/*!
Puts data contained in \a buffer into send buffer */ 775 status_t 776 TCPEndpoint::SendData(net_buffer *buffer) 777 { 778 MutexLocker lock(fLock); 779 780 TRACE("SendData(buffer %p, size %lu, flags %lx) [total %lu bytes, has %lu]", 781 buffer, buffer->size, buffer->flags, fSendQueue.Size(), 782 fSendQueue.Free()); 783 784 if (fState == CLOSED) 785 return ENOTCONN; 786 if (fState == LISTEN) 787 return EDESTADDRREQ; 788 if (!is_writable(fState)) { 789 // we only send signals when called from userland 790 if (gStackModule->is_syscall()) 791 send_signal(find_thread(NULL), SIGPIPE); 792 return EPIPE; 793 } 794 795 uint32 flags = buffer->flags; 796 size_t left = buffer->size; 797 798 bigtime_t timeout = absolute_timeout(socket->send.timeout); 799 if (gStackModule->is_restarted_syscall()) 800 timeout = gStackModule->restore_syscall_restart_timeout(); 801 else 802 gStackModule->store_syscall_restart_timeout(timeout); 803 804 while (left > 0) { 805 while (fSendQueue.Free() < socket->send.low_water_mark) { 806 // wait until enough space is available 807 status_t status = fSendList.Wait(lock, timeout); 808 if (status < B_OK) { 809 TRACE(" SendData() returning %s (%d)", 810 strerror(posix_error(status)), (int)posix_error(status)); 811 return posix_error(status); 812 } 813 814 if (!is_writable(fState)) { 815 // we only send signals when called from userland 816 if (gStackModule->is_syscall()) 817 send_signal(find_thread(NULL), SIGPIPE); 818 return EPIPE; 819 } 820 } 821 822 size_t size = fSendQueue.Free(); 823 if (size < left) { 824 // we need to split the original buffer 825 net_buffer* clone = gBufferModule->clone(buffer, false); 826 // TODO: add offset/size parameter to net_buffer::clone() or 827 // even a move_data() function, as this is a bit inefficient 828 if (clone == NULL) 829 return ENOBUFS; 830 831 status_t status = gBufferModule->trim(clone, size); 832 if (status != B_OK) { 833 gBufferModule->free(clone); 834 return status; 835 } 836 837 
gBufferModule->remove_header(buffer, size); 838 left -= size; 839 fSendQueue.Add(clone); 840 } else { 841 left -= buffer->size; 842 fSendQueue.Add(buffer); 843 } 844 } 845 846 TRACE(" SendData(): %lu bytes used.", fSendQueue.Used()); 847 848 bool force = false; 849 if ((flags & MSG_OOB) != 0) { 850 fSendUrgentOffset = fSendQueue.LastSequence(); 851 // RFC 961 specifies that the urgent offset points to the last 852 // byte of urgent data. However, this is commonly implemented as 853 // here, ie. it points to the first byte after the urgent data. 854 force = true; 855 } 856 if ((flags & MSG_EOF) != 0) 857 _Disconnect(false); 858 859 if (fState == ESTABLISHED || fState == FINISH_RECEIVED) 860 _SendQueued(force); 861 862 return B_OK; 863 } 864 865 866 ssize_t 867 TCPEndpoint::SendAvailable() 868 { 869 MutexLocker locker(fLock); 870 871 ssize_t available; 872 873 if (is_writable(fState)) 874 available = fSendQueue.Free(); 875 else 876 available = EPIPE; 877 878 TRACE("SendAvailable(): %li", available); 879 return available; 880 } 881 882 883 status_t 884 TCPEndpoint::FillStat(net_stat *stat) 885 { 886 MutexLocker _(fLock); 887 888 strlcpy(stat->state, name_for_state(fState), sizeof(stat->state)); 889 stat->receive_queue_size = fReceiveQueue.Available(); 890 stat->send_queue_size = fSendQueue.Used(); 891 892 return B_OK; 893 } 894 895 896 status_t 897 TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer) 898 { 899 TRACE("ReadData(%lu bytes, flags 0x%x)", numBytes, (unsigned int)flags); 900 901 MutexLocker locker(fLock); 902 903 *_buffer = NULL; 904 905 if (fState == CLOSED) 906 return ENOTCONN; 907 908 bigtime_t timeout = absolute_timeout(socket->receive.timeout); 909 if (gStackModule->is_restarted_syscall()) 910 timeout = gStackModule->restore_syscall_restart_timeout(); 911 else 912 gStackModule->store_syscall_restart_timeout(timeout); 913 914 if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) { 915 if (flags & MSG_DONTWAIT) 916 
return B_WOULD_BLOCK; 917 918 status_t status = _WaitForEstablished(locker, timeout); 919 if (status < B_OK) 920 return posix_error(status); 921 } 922 923 size_t dataNeeded = socket->receive.low_water_mark; 924 925 // When MSG_WAITALL is set then the function should block 926 // until the full amount of data can be returned. 927 if (flags & MSG_WAITALL) 928 dataNeeded = numBytes; 929 930 // TODO: add support for urgent data (MSG_OOB) 931 932 while (true) { 933 if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE 934 || fState == TIME_WAIT) { 935 // ``Connection closing''. 936 return B_OK; 937 } 938 939 if (fReceiveQueue.Available() > 0) { 940 if (fReceiveQueue.Available() >= dataNeeded 941 || (fReceiveQueue.PushedData() > 0 942 && fReceiveQueue.PushedData() >= fReceiveQueue.Available())) 943 break; 944 } else if (fState == FINISH_RECEIVED) { 945 // ``If no text is awaiting delivery, the RECEIVE will 946 // get a Connection closing''. 947 return B_OK; 948 } 949 950 if ((flags & MSG_DONTWAIT) != 0 || socket->receive.timeout == 0) 951 return B_WOULD_BLOCK; 952 953 if ((fFlags & FLAG_NO_RECEIVE) != 0) 954 return B_OK; 955 956 status_t status = fReceiveList.Wait(locker, timeout); 957 if (status < B_OK) { 958 // The Open Group base specification mentions that EINTR should be 959 // returned if the recv() is interrupted before _any data_ is 960 // available. So we actually check if there is data, and if so, 961 // push it to the user. 
962 if ((status == B_TIMED_OUT || status == B_INTERRUPTED) 963 && fReceiveQueue.Available() > 0) 964 break; 965 966 return posix_error(status); 967 } 968 } 969 970 TRACE(" ReadData(): %lu are available.", fReceiveQueue.Available()); 971 972 if (numBytes < fReceiveQueue.Available()) 973 fReceiveList.Signal(); 974 975 bool clone = (flags & MSG_PEEK) != 0; 976 977 ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer); 978 979 TRACE(" ReadData(): %lu bytes kept.", fReceiveQueue.Available()); 980 981 // if we are opening the window, check if we should send an ACK 982 if (!clone) 983 SendAcknowledge(false); 984 985 return receivedBytes; 986 } 987 988 989 ssize_t 990 TCPEndpoint::ReadAvailable() 991 { 992 MutexLocker locker(fLock); 993 994 TRACE("ReadAvailable(): %li", _AvailableData()); 995 996 return _AvailableData(); 997 } 998 999 1000 status_t 1001 TCPEndpoint::SetSendBufferSize(size_t length) 1002 { 1003 MutexLocker _(fLock); 1004 fSendQueue.SetMaxBytes(length); 1005 return B_OK; 1006 } 1007 1008 1009 status_t 1010 TCPEndpoint::SetReceiveBufferSize(size_t length) 1011 { 1012 MutexLocker _(fLock); 1013 fReceiveQueue.SetMaxBytes(length); 1014 return B_OK; 1015 } 1016 1017 1018 status_t 1019 TCPEndpoint::GetOption(int option, void* _value, int* _length) 1020 { 1021 if (*_length != sizeof(int)) 1022 return B_BAD_VALUE; 1023 1024 int* value = (int*)_value; 1025 1026 switch (option) { 1027 case TCP_NODELAY: 1028 if ((fOptions & TCP_NODELAY) != 0) 1029 *value = 1; 1030 else 1031 *value = 0; 1032 return B_OK; 1033 1034 case TCP_MAXSEG: 1035 *value = fReceiveMaxSegmentSize; 1036 return B_OK; 1037 1038 default: 1039 return B_BAD_VALUE; 1040 } 1041 } 1042 1043 1044 status_t 1045 TCPEndpoint::SetOption(int option, const void* _value, int length) 1046 { 1047 if (option != TCP_NODELAY) 1048 return B_BAD_VALUE; 1049 1050 if (length != sizeof(int)) 1051 return B_BAD_VALUE; 1052 1053 const int* value = (const int*)_value; 1054 1055 MutexLocker _(fLock); 1056 if 
(*value) 1057 fOptions |= TCP_NODELAY; 1058 else 1059 fOptions &= ~TCP_NODELAY; 1060 1061 return B_OK; 1062 } 1063 1064 1065 // #pragma mark - misc 1066 1067 1068 bool 1069 TCPEndpoint::IsBound() const 1070 { 1071 return !LocalAddress().IsEmpty(true); 1072 } 1073 1074 1075 bool 1076 TCPEndpoint::IsLocal() const 1077 { 1078 return (fFlags & FLAG_LOCAL) != 0; 1079 } 1080 1081 1082 status_t 1083 TCPEndpoint::DelayedAcknowledge() 1084 { 1085 if (gStackModule->cancel_timer(&fDelayedAcknowledgeTimer)) { 1086 // timer was active, send an ACK now (with the exception above, 1087 // we send every other ACK) 1088 return SendAcknowledge(true); 1089 } 1090 1091 gStackModule->set_timer(&fDelayedAcknowledgeTimer, 1092 TCP_DELAYED_ACKNOWLEDGE_TIMEOUT); 1093 return B_OK; 1094 } 1095 1096 1097 status_t 1098 TCPEndpoint::SendAcknowledge(bool force) 1099 { 1100 return _SendQueued(force, 0); 1101 } 1102 1103 1104 void 1105 TCPEndpoint::_StartPersistTimer() 1106 { 1107 gStackModule->set_timer(&fPersistTimer, 1000000LL); 1108 } 1109 1110 1111 void 1112 TCPEndpoint::_EnterTimeWait() 1113 { 1114 TRACE("_EnterTimeWait()\n"); 1115 1116 _CancelConnectionTimers(); 1117 1118 if (fState == TIME_WAIT && IsLocal()) { 1119 // we do not use TIME_WAIT state for local connections 1120 fFlags |= FLAG_DELETE_ON_CLOSE; 1121 return; 1122 } 1123 1124 _UpdateTimeWait(); 1125 } 1126 1127 1128 void 1129 TCPEndpoint::_UpdateTimeWait() 1130 { 1131 gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1); 1132 } 1133 1134 1135 void 1136 TCPEndpoint::_CancelConnectionTimers() 1137 { 1138 gStackModule->cancel_timer(&fRetransmitTimer); 1139 gStackModule->cancel_timer(&fPersistTimer); 1140 gStackModule->cancel_timer(&fDelayedAcknowledgeTimer); 1141 } 1142 1143 1144 /*! Sends the FIN flag to the peer when the connection is still open. 1145 Moves the endpoint to the next state depending on where it was. 
	If sending fails, the previous state is restored and the error is
	returned. In all other states this is a no-op that returns B_OK.
	\a closing is not used by this function.
*/
status_t
TCPEndpoint::_Disconnect(bool closing)
{
	tcp_state previousState = fState;

	if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED)
		fState = FINISH_SENT;
	else if (fState == FINISH_RECEIVED)
		fState = WAIT_FOR_FINISH_ACKNOWLEDGE;
	else
		return B_OK;

	T(State(this));

	status_t status = _SendQueued();
	if (status != B_OK) {
		// roll back the state change
		fState = previousState;
		T(State(this));
		return status;
	}

	return B_OK;
}


/*!	Switches to the ESTABLISHED state. If the socket still has a parent, it
	is marked as connected and the accept semaphore is released. Threads
	waiting on the send list are woken up.
*/
void
TCPEndpoint::_MarkEstablished()
{
	fState = ESTABLISHED;
	T(State(this));

	if (gSocketModule->has_parent(socket)) {
		gSocketModule->set_connected(socket);
		release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE);
	}

	fSendList.Signal();
}


/*!	Waits on the send list until the state is at least ESTABLISHED.
	\a timeout is passed on to WaitList::Wait().
	\return the socket's error if one is set, the error from waiting, or
		B_OK once the state has been reached.
*/
status_t
TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout)
{
	// TODO: Checking for CLOSED seems correct, but breaks several neon tests.
	// When investigating this, also have a look at _Close() and _HandleReset().
	while (fState < ESTABLISHED/* && fState != CLOSED*/) {
		if (socket->error != B_OK)
			return socket->error;

		status_t status = fSendList.Wait(locker, timeout);
		if (status < B_OK)
			return status;
	}

	return B_OK;
}


// #pragma mark - receive


/*!	Puts the endpoint into the CLOSED state: cancels the connection timers,
	flags the endpoint for deletion on close, and wakes up waiting senders
	and readers. A socket that still has a parent is marked as aborted.
*/
void
TCPEndpoint::_Close()
{
	_CancelConnectionTimers();
	fState = CLOSED;
	T(State(this));

	fFlags |= FLAG_DELETE_ON_CLOSE;

	fSendList.Signal();
	_NotifyReader();

	if (gSocketModule->has_parent(socket)) {
		// We still have a parent - obviously, we haven't been accepted yet,
		// so no one could ever close us.
		_CancelConnectionTimers();
		gSocketModule->set_aborted(socket);
	}
}


/*!	Stores \a error as the socket error, closes the endpoint, and notifies
	write and error select events with \a error.
*/
void
TCPEndpoint::_HandleReset(status_t error)
{
	socket->error = error;
	_Close();

	gSocketModule->notify(socket, B_SELECT_WRITE, error);
	gSocketModule->notify(socket, B_SELECT_ERROR, error);
}


/*!	Counts a duplicate acknowledgement. Nothing happens for the first two;
	on the third, the slow start values are reset, the congestion window is
	set to the slow start threshold plus three segments, and sending restarts
	at the acknowledged sequence. Every further duplicate grows the
	congestion window by one segment. In both cases queued data is sent.
*/
void
TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment)
{
	if (++fDuplicateAcknowledgeCount < 3)
		return;

	if (fDuplicateAcknowledgeCount == 3) {
		_ResetSlowStart();
		fCongestionWindow = fSlowStartThreshold + 3
			* fSendMaxSegmentSize;
		fSendNext = segment.acknowledge;
	} else if (fDuplicateAcknowledgeCount > 3)
		fCongestionWindow += fSendMaxSegmentSize;

	_SendQueued();
}


/*!	If the timestamp option is in use, remembers the segment's timestamp
	value when the segment covers the sequence of the last acknowledgement
	sent, ie. when fLastAcknowledgeSent lies within
	[sequence, sequence + \a segmentLength).
*/
void
TCPEndpoint::_UpdateTimestamps(tcp_segment_header& segment,
	size_t segmentLength)
{
	if (fFlags & FLAG_OPTION_TIMESTAMP) {
		tcp_sequence sequence(segment.sequence);

		if (fLastAcknowledgeSent >= sequence
			&& fLastAcknowledgeSent < (sequence + segmentLength))
			fReceivedTimestamp = segment.timestamp_value;
	}
}


/*!	Returns the amount of data a reader could retrieve: the number of
	connected sockets for a listening endpoint, 0 while in SYNCHRONIZE_SENT,
	and otherwise the bytes available in the receive queue. If the queue is
	empty and the endpoint cannot receive anymore, ENOTCONN is returned.
*/
ssize_t
TCPEndpoint::_AvailableData() const
{
	// TODO: Refer to the FLAG_NO_RECEIVE comment above regarding
	//       the application of FLAG_NO_RECEIVE in listen()ing
	//       sockets.
	if (fState == LISTEN)
		return gSocketModule->count_connected(socket);
	if (fState == SYNCHRONIZE_SENT)
		return 0;

	ssize_t availableData = fReceiveQueue.Available();

	if (availableData == 0 && !_ShouldReceive())
		return ENOTCONN;

	return availableData;
}


/*!	Wakes up threads waiting on the receive list, and notifies read select
	events with the result of _AvailableData().
*/
void
TCPEndpoint::_NotifyReader()
{
	fReceiveList.Signal();
	gSocketModule->notify(socket, B_SELECT_READ, _AvailableData());
}


/*!	Adds \a buffer to the receive queue at the segment's sequence, and
	updates fReceiveNext from the queue.
	Once a FIN has been seen, its position is remembered, and the FIN flag of
	\a segment is cleared or set depending on whether fReceiveNext has
	reached that position yet.
	\return whether the receive queue has data available afterwards.
*/
bool
TCPEndpoint::_AddData(tcp_segment_header& segment, net_buffer* buffer)
{
	if ((segment.flags & TCP_FLAG_FINISH) != 0) {
		// Remember the position of the finish received flag
		fFinishReceived = true;
		fFinishReceivedAt = segment.sequence + buffer->size;
	}

	fReceiveQueue.Add(buffer, segment.sequence);
	fReceiveNext = fReceiveQueue.NextSequence();

	if (fFinishReceived) {
		// Set or reset the finish flag on the current segment
		if (fReceiveNext < fFinishReceivedAt)
			segment.flags &= ~TCP_FLAG_FINISH;
		else
			segment.flags |= TCP_FLAG_FINISH;
	}

	TRACE(" _AddData(): adding data, receive next = %lu. Now have %lu bytes.",
		fReceiveNext.Number(), fReceiveQueue.Available());

	if ((segment.flags & TCP_FLAG_PUSH) != 0)
		fReceiveQueue.SetPushPointer();

	return fReceiveQueue.Available() > 0;
}


/*!	Initializes the receive side from the peer's SYN \a segment: records the
	initial receive sequence (the SYN itself counts as one sequence number,
	so \a segment.sequence is incremented), and - unless TCP_NOOPT is set -
	takes over the maximum segment size, window scale, and timestamp options
	the peer announced. Finally the congestion window is set to two segments
	and the slow start threshold to the advertised window.
*/
void
TCPEndpoint::_PrepareReceivePath(tcp_segment_header& segment)
{
	fInitialReceiveSequence = segment.sequence;
	fFinishReceived = false;

	// count the received SYN
	segment.sequence++;

	fReceiveNext = segment.sequence;
	fReceiveQueue.SetInitialSequence(segment.sequence);

	if ((fOptions & TCP_NOOPT) == 0) {
		if (segment.max_segment_size > 0)
			fSendMaxSegmentSize = segment.max_segment_size;

		if (segment.options & TCP_HAS_WINDOW_SCALE) {
			fFlags |= FLAG_OPTION_WINDOW_SCALE;
			fSendWindowShift = segment.window_shift;
		} else {
			// the peer doesn't scale, so we must not scale our window either
			fFlags &= ~FLAG_OPTION_WINDOW_SCALE;
			fReceiveWindowShift = 0;
		}

		if (segment.options & TCP_HAS_TIMESTAMPS) {
			fFlags |= FLAG_OPTION_TIMESTAMP;
			fReceivedTimestamp = segment.timestamp_value;
		} else
			fFlags &= ~FLAG_OPTION_TIMESTAMP;
	}

	fCongestionWindow = 2 * fSendMaxSegmentSize;
	fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift;
}


/*!	Returns whether the endpoint can still receive data: reading must not
	have been shut down (FLAG_NO_RECEIVE), and the state must be ESTABLISHED,
	FINISH_SENT, or FINISH_ACKNOWLEDGED.
*/
bool
TCPEndpoint::_ShouldReceive() const
{
	if ((fFlags & FLAG_NO_RECEIVE) != 0)
		return false;

	return fState == ESTABLISHED || fState == FINISH_SENT
		|| fState == FINISH_ACKNOWLEDGED;
}


int32
TCPEndpoint::_Spawn(TCPEndpoint* parent, tcp_segment_header& segment,
	net_buffer* buffer)
{
	MutexLocker _(fLock);

	// TODO error checking
	ProtocolSocket::Open();

	fState = SYNCHRONIZE_RECEIVED;
	T(Spawn(parent, this));

	fManager = parent->fManager;

	LocalAddress().SetTo(buffer->destination);
	PeerAddress().SetTo(buffer->source);

	TRACE("Spawn()");

	// TODO: proper error
handling! 1397 if (fManager->BindChild(this) != B_OK) { 1398 T(Error(this, "binding failed", __LINE__)); 1399 return DROP; 1400 } 1401 if (_PrepareSendPath(*PeerAddress()) != B_OK) { 1402 T(Error(this, "prepare send faild", __LINE__)); 1403 return DROP; 1404 } 1405 1406 fOptions = parent->fOptions; 1407 fAcceptSemaphore = parent->fAcceptSemaphore; 1408 1409 _PrepareReceivePath(segment); 1410 1411 // send SYN+ACK 1412 if (_SendQueued() != B_OK) { 1413 T(Error(this, "sending failed", __LINE__)); 1414 return DROP; 1415 } 1416 1417 segment.flags &= ~TCP_FLAG_SYNCHRONIZE; 1418 // we handled this flag now, it must not be set for further processing 1419 1420 return _Receive(segment, buffer); 1421 } 1422 1423 1424 int32 1425 TCPEndpoint::_ListenReceive(tcp_segment_header& segment, net_buffer* buffer) 1426 { 1427 TRACE("ListenReceive()"); 1428 1429 // Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state, 1430 // but the error behaviour differs 1431 if (segment.flags & TCP_FLAG_RESET) 1432 return DROP; 1433 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) 1434 return DROP | RESET; 1435 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0) 1436 return DROP; 1437 1438 // TODO: drop broadcast/multicast 1439 1440 // spawn new endpoint for accept() 1441 net_socket* newSocket; 1442 if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK) { 1443 T(Error(this, "spawning failed", __LINE__)); 1444 return DROP; 1445 } 1446 1447 return ((TCPEndpoint *)newSocket->first_protocol)->_Spawn(this, 1448 segment, buffer); 1449 } 1450 1451 1452 int32 1453 TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment, 1454 net_buffer *buffer) 1455 { 1456 TRACE("_SynchronizeSentReceive()"); 1457 1458 if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0 1459 && (fInitialSendSequence >= segment.acknowledge 1460 || fSendMax < segment.acknowledge)) 1461 return DROP | RESET; 1462 1463 if (segment.flags & TCP_FLAG_RESET) { 1464 _HandleReset(ECONNREFUSED); 1465 return DROP; 1466 } 1467 1468 if 
((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0) 1469 return DROP; 1470 1471 fSendUnacknowledged = segment.acknowledge; 1472 _PrepareReceivePath(segment); 1473 1474 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) { 1475 _MarkEstablished(); 1476 } else { 1477 // simultaneous open 1478 fState = SYNCHRONIZE_RECEIVED; 1479 T(State(this)); 1480 } 1481 1482 segment.flags &= ~TCP_FLAG_SYNCHRONIZE; 1483 // we handled this flag now, it must not be set for further processing 1484 1485 return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE; 1486 } 1487 1488 1489 int32 1490 TCPEndpoint::_Receive(tcp_segment_header& segment, net_buffer* buffer) 1491 { 1492 uint32 advertisedWindow = (uint32)segment.advertised_window 1493 << fSendWindowShift; 1494 size_t segmentLength = buffer->size; 1495 1496 // First, handle the most common case for uni-directional data transfer 1497 // (known as header prediction - the segment must not change the window, 1498 // and must be the expected sequence, and contain no control flags) 1499 1500 if (fState == ESTABLISHED 1501 && segment.AcknowledgeOnly() 1502 && fReceiveNext == segment.sequence 1503 && advertisedWindow > 0 && advertisedWindow == fSendWindow 1504 && fSendNext == fSendMax) { 1505 _UpdateTimestamps(segment, segmentLength); 1506 1507 if (segmentLength == 0) { 1508 // this is a pure acknowledge segment - we're on the sending end 1509 if (fSendUnacknowledged < segment.acknowledge 1510 && fSendMax >= segment.acknowledge) { 1511 _Acknowledged(segment); 1512 return DROP; 1513 } 1514 } else if (segment.acknowledge == fSendUnacknowledged 1515 && fReceiveQueue.IsContiguous() 1516 && fReceiveQueue.Free() >= segmentLength 1517 && (fFlags & FLAG_NO_RECEIVE) == 0) { 1518 if (_AddData(segment, buffer)) 1519 _NotifyReader(); 1520 1521 return KEEP | ((segment.flags & TCP_FLAG_PUSH) != 0 1522 ? 
IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE); 1523 } 1524 } 1525 1526 // The fast path was not applicable, so we continue with the standard 1527 // processing of the incoming segment 1528 1529 ASSERT(fState != SYNCHRONIZE_SENT && fState != LISTEN); 1530 1531 if (fState != CLOSED && fState != TIME_WAIT) { 1532 // Check sequence number 1533 if (!segment_in_sequence(segment, segmentLength, fReceiveNext, 1534 fReceiveWindow)) { 1535 TRACE(" Receive(): segment out of window, next: %lu wnd: %lu", 1536 fReceiveNext.Number(), fReceiveWindow); 1537 if ((segment.flags & TCP_FLAG_RESET) != 0) { 1538 // TODO: this doesn't look right - review! 1539 return DROP; 1540 } 1541 return DROP | IMMEDIATE_ACKNOWLEDGE; 1542 } 1543 } 1544 1545 if ((segment.flags & TCP_FLAG_RESET) != 0) { 1546 // Is this a valid reset? 1547 // We generally ignore resets in time wait state (see RFC 1337) 1548 if (fLastAcknowledgeSent <= segment.sequence 1549 && tcp_sequence(segment.sequence) < (fLastAcknowledgeSent 1550 + fReceiveWindow) 1551 && fState != TIME_WAIT) { 1552 status_t error; 1553 if (fState == SYNCHRONIZE_RECEIVED) 1554 error = ECONNREFUSED; 1555 else if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE) 1556 error = ENOTCONN; 1557 else 1558 error = ECONNRESET; 1559 1560 _HandleReset(error); 1561 } 1562 1563 return DROP; 1564 } 1565 1566 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0 1567 || (fState == SYNCHRONIZE_RECEIVED 1568 && (fInitialReceiveSequence > segment.sequence 1569 || ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0 1570 && (fSendUnacknowledged > segment.acknowledge 1571 || fSendMax < segment.acknowledge))))) { 1572 // reset the connection - either the initial SYN was faulty, or we 1573 // received a SYN within the data stream 1574 return DROP | RESET; 1575 } 1576 1577 // TODO: Check this! Why do we advertize a window outside of what we should 1578 // buffer? 
1579 fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow); 1580 // the window must not shrink 1581 1582 // trim buffer to be within the receive window 1583 int32 drop = (int32)(fReceiveNext - segment.sequence).Number(); 1584 if (drop > 0) { 1585 if ((uint32)drop > buffer->size 1586 || ((uint32)drop == buffer->size 1587 && (segment.flags & TCP_FLAG_FINISH) == 0)) { 1588 // don't accidently remove a FIN we shouldn't remove 1589 segment.flags &= ~TCP_FLAG_FINISH; 1590 drop = buffer->size; 1591 } 1592 1593 // remove duplicate data at the start 1594 TRACE("* remove %ld bytes from the start", drop); 1595 gBufferModule->remove_header(buffer, drop); 1596 segment.sequence += drop; 1597 } 1598 1599 int32 action = KEEP; 1600 1601 drop = (int32)(segment.sequence + buffer->size 1602 - (fReceiveNext + fReceiveWindow)).Number(); 1603 if (drop > 0) { 1604 // remove data exceeding our window 1605 if ((uint32)drop >= buffer->size) { 1606 // if we can accept data, or the segment is not what we'd expect, 1607 // drop the segment (an immediate acknowledge is always triggered) 1608 if (fReceiveWindow != 0 || segment.sequence != fReceiveNext) 1609 return DROP | IMMEDIATE_ACKNOWLEDGE; 1610 1611 action |= IMMEDIATE_ACKNOWLEDGE; 1612 } 1613 1614 if ((segment.flags & TCP_FLAG_FINISH) != 0) { 1615 // we need to remove the finish, too, as part of the data 1616 drop--; 1617 } 1618 1619 segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH); 1620 TRACE("* remove %ld bytes from the end", drop); 1621 gBufferModule->remove_trailer(buffer, drop); 1622 } 1623 1624 #ifdef TRACE_TCP 1625 if (advertisedWindow > fSendWindow) { 1626 TRACE(" Receive(): Window update %lu -> %lu", fSendWindow, 1627 advertisedWindow); 1628 } 1629 #endif 1630 1631 fSendWindow = advertisedWindow; 1632 if (advertisedWindow > fSendMaxWindow) 1633 fSendMaxWindow = advertisedWindow; 1634 1635 // Then look at the acknowledgement for any updates 1636 1637 if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) { 1638 // process 
acknowledged data 1639 if (fState == SYNCHRONIZE_RECEIVED) 1640 _MarkEstablished(); 1641 1642 if (fSendMax < segment.acknowledge) 1643 return DROP | IMMEDIATE_ACKNOWLEDGE; 1644 1645 if (segment.acknowledge < fSendUnacknowledged) { 1646 if (buffer->size == 0 && advertisedWindow == fSendWindow 1647 && (segment.flags & TCP_FLAG_FINISH) == 0) { 1648 TRACE("Receive(): duplicate ack!"); 1649 1650 _DuplicateAcknowledge(segment); 1651 } 1652 1653 return DROP; 1654 } else { 1655 // this segment acknowledges in flight data 1656 1657 if (fDuplicateAcknowledgeCount >= 3) { 1658 // deflate the window. 1659 fCongestionWindow = fSlowStartThreshold; 1660 } 1661 1662 fDuplicateAcknowledgeCount = 0; 1663 1664 if (fSendMax == segment.acknowledge) 1665 TRACE("Receive(): all inflight data ack'd!"); 1666 1667 if (segment.acknowledge > fSendQueue.LastSequence() 1668 && fState > ESTABLISHED) { 1669 TRACE("Receive(): FIN has been acknowledged!"); 1670 1671 switch (fState) { 1672 case FINISH_SENT: 1673 fState = FINISH_ACKNOWLEDGED; 1674 T(State(this)); 1675 break; 1676 case CLOSING: 1677 fState = TIME_WAIT; 1678 T(State(this)); 1679 _EnterTimeWait(); 1680 return DROP; 1681 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1682 _Close(); 1683 break; 1684 1685 default: 1686 break; 1687 } 1688 } 1689 1690 if (fState != CLOSED) 1691 _Acknowledged(segment); 1692 } 1693 } 1694 1695 if (segment.flags & TCP_FLAG_URGENT) { 1696 if (fState == ESTABLISHED || fState == FINISH_SENT 1697 || fState == FINISH_ACKNOWLEDGED) { 1698 // TODO: Handle urgent data: 1699 // - RCV.UP <- max(RCV.UP, SEG.UP) 1700 // - signal the user that urgent data is available (SIGURG) 1701 } 1702 } 1703 1704 bool notify = false; 1705 1706 if ((buffer->size > 0 || (segment.flags & TCP_FLAG_FINISH) != 0) 1707 && _ShouldReceive()) 1708 notify = _AddData(segment, buffer); 1709 else { 1710 if ((fFlags & FLAG_NO_RECEIVE) != 0) 1711 fReceiveNext += buffer->size; 1712 1713 action = (action & ~KEEP) | DROP; 1714 } 1715 1716 if ((segment.flags & 
TCP_FLAG_FINISH) != 0) { 1717 segmentLength++; 1718 if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) { 1719 TRACE("Receive(): peer is finishing connection!"); 1720 fReceiveNext++; 1721 notify = true; 1722 1723 // FIN implies PUSH 1724 fReceiveQueue.SetPushPointer(); 1725 1726 // we'll reply immediately to the FIN if we are not 1727 // transitioning to TIME WAIT so we immediatly ACK it. 1728 action |= IMMEDIATE_ACKNOWLEDGE; 1729 1730 // other side is closing connection; change states 1731 switch (fState) { 1732 case ESTABLISHED: 1733 case SYNCHRONIZE_RECEIVED: 1734 fState = FINISH_RECEIVED; 1735 T(State(this)); 1736 break; 1737 case FINISH_SENT: 1738 // simultaneous close 1739 fState = CLOSING; 1740 T(State(this)); 1741 break; 1742 case FINISH_ACKNOWLEDGED: 1743 fState = TIME_WAIT; 1744 T(State(this)); 1745 _EnterTimeWait(); 1746 break; 1747 case TIME_WAIT: 1748 _UpdateTimeWait(); 1749 break; 1750 1751 default: 1752 break; 1753 } 1754 } 1755 } 1756 1757 if (notify) 1758 _NotifyReader(); 1759 1760 if (buffer->size > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0) 1761 action |= ACKNOWLEDGE; 1762 1763 _UpdateTimestamps(segment, segmentLength); 1764 1765 TRACE("Receive() Action %ld", action); 1766 1767 return action; 1768 } 1769 1770 1771 int32 1772 TCPEndpoint::SegmentReceived(tcp_segment_header& segment, net_buffer* buffer) 1773 { 1774 MutexLocker locker(fLock); 1775 1776 TRACE("SegmentReceived(): buffer %p (%lu bytes) address %s to %s\n" 1777 "\tflags 0x%x, seq %lu, ack %lu, wnd %lu", 1778 buffer, buffer->size, PrintAddress(buffer->source), 1779 PrintAddress(buffer->destination), segment.flags, segment.sequence, 1780 segment.acknowledge, 1781 (uint32)segment.advertised_window << fSendWindowShift); 1782 T(Receive(this, segment, 1783 (uint32)segment.advertised_window << fSendWindowShift, buffer)); 1784 int32 segmentAction = DROP; 1785 1786 switch (fState) { 1787 case LISTEN: 1788 segmentAction = _ListenReceive(segment, buffer); 1789 break; 
1790 1791 case SYNCHRONIZE_SENT: 1792 segmentAction = _SynchronizeSentReceive(segment, buffer); 1793 break; 1794 1795 case SYNCHRONIZE_RECEIVED: 1796 case ESTABLISHED: 1797 case FINISH_RECEIVED: 1798 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1799 case FINISH_SENT: 1800 case FINISH_ACKNOWLEDGED: 1801 case CLOSING: 1802 case TIME_WAIT: 1803 case CLOSED: 1804 segmentAction = _Receive(segment, buffer); 1805 break; 1806 } 1807 1808 // process acknowledge action as asked for by the *Receive() method 1809 if (segmentAction & IMMEDIATE_ACKNOWLEDGE) 1810 SendAcknowledge(true); 1811 else if (segmentAction & ACKNOWLEDGE) 1812 DelayedAcknowledge(); 1813 1814 if ((fFlags & (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) 1815 == (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) { 1816 locker.Unlock(); 1817 gSocketModule->release_socket(socket); 1818 } 1819 1820 return segmentAction; 1821 } 1822 1823 1824 // #pragma mark - send 1825 1826 1827 inline uint8 1828 TCPEndpoint::_CurrentFlags() 1829 { 1830 // we don't set FLAG_FINISH here, instead we do it 1831 // conditionally below depending if we are sending 1832 // the last bytes of the send queue. 
1833 1834 switch (fState) { 1835 case CLOSED: 1836 return TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE; 1837 1838 case SYNCHRONIZE_SENT: 1839 return TCP_FLAG_SYNCHRONIZE; 1840 case SYNCHRONIZE_RECEIVED: 1841 return TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE; 1842 1843 case ESTABLISHED: 1844 case FINISH_RECEIVED: 1845 case FINISH_ACKNOWLEDGED: 1846 case TIME_WAIT: 1847 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1848 case FINISH_SENT: 1849 case CLOSING: 1850 return TCP_FLAG_ACKNOWLEDGE; 1851 1852 default: 1853 return 0; 1854 } 1855 } 1856 1857 1858 inline bool 1859 TCPEndpoint::_ShouldSendSegment(tcp_segment_header& segment, uint32 length, 1860 uint32 segmentMaxSize, uint32 flightSize) 1861 { 1862 if (length > 0) { 1863 // Avoid the silly window syndrome - we only send a segment in case: 1864 // - we have a full segment to send, or 1865 // - we're at the end of our buffer queue, or 1866 // - the buffer is at least larger than half of the maximum send window, 1867 // or 1868 // - we're retransmitting data 1869 if (length == segmentMaxSize 1870 || (fOptions & TCP_NODELAY) != 0 1871 || tcp_sequence(fSendNext + length) == fSendQueue.LastSequence() 1872 || (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2)) 1873 return true; 1874 } 1875 1876 // check if we need to send a window update to the peer 1877 if (segment.advertised_window > 0) { 1878 // correct the window to take into account what already has been advertised 1879 uint32 window = (segment.advertised_window << fReceiveWindowShift) 1880 - (fReceiveMaxAdvertised - fReceiveNext).Number(); 1881 1882 // if we can advertise a window larger than twice the maximum segment 1883 // size, or half the maximum buffer size we send a window update 1884 if (window >= (fReceiveMaxSegmentSize << 1) 1885 || window >= (socket->receive.buffer_size >> 1)) 1886 return true; 1887 } 1888 1889 if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH 1890 | TCP_FLAG_RESET)) != 0) 1891 return true; 1892 1893 // We do have urgent data pending 1894 if 
(fSendUrgentOffset > fSendNext) 1895 return true; 1896 1897 // there is no reason to send a segment just now 1898 return false; 1899 } 1900 1901 1902 status_t 1903 TCPEndpoint::_SendQueued(bool force) 1904 { 1905 return _SendQueued(force, fSendWindow); 1906 } 1907 1908 1909 /*! Sends one or more TCP segments with the data waiting in the queue, or some 1910 specific flags that need to be sent. 1911 */ 1912 status_t 1913 TCPEndpoint::_SendQueued(bool force, uint32 sendWindow) 1914 { 1915 if (fRoute == NULL) 1916 return B_ERROR; 1917 1918 // in passive state? 1919 if (fState == LISTEN) 1920 return B_ERROR; 1921 1922 tcp_segment_header segment(_CurrentFlags()); 1923 1924 if ((fOptions & TCP_NOOPT) == 0) { 1925 if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0) { 1926 segment.options |= TCP_HAS_TIMESTAMPS; 1927 segment.timestamp_reply = fReceivedTimestamp; 1928 segment.timestamp_value = tcp_now(); 1929 } 1930 1931 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0 1932 && fSendNext == fInitialSendSequence) { 1933 // add connection establishment options 1934 segment.max_segment_size = fReceiveMaxSegmentSize; 1935 if (fFlags & FLAG_OPTION_WINDOW_SCALE) { 1936 segment.options |= TCP_HAS_WINDOW_SCALE; 1937 segment.window_shift = fReceiveWindowShift; 1938 } 1939 } 1940 } 1941 1942 size_t availableBytes = fReceiveQueue.Free(); 1943 if (fFlags & FLAG_OPTION_WINDOW_SCALE) 1944 segment.advertised_window = availableBytes >> fReceiveWindowShift; 1945 else 1946 segment.advertised_window = min_c(TCP_MAX_WINDOW, availableBytes); 1947 1948 segment.acknowledge = fReceiveNext.Number(); 1949 1950 // Process urgent data 1951 if (fSendUrgentOffset > fSendNext) { 1952 segment.flags |= TCP_FLAG_URGENT; 1953 segment.urgent_offset = (fSendUrgentOffset - fSendNext).Number(); 1954 } else { 1955 fSendUrgentOffset = fSendUnacknowledged.Number(); 1956 // Keep urgent offset updated, so that it doesn't reach into our 1957 // send window on overlap 1958 segment.urgent_offset = 0; 1959 } 1960 1961 if 
(fCongestionWindow > 0 && fCongestionWindow < sendWindow) 1962 sendWindow = fCongestionWindow; 1963 1964 // fSendUnacknowledged 1965 // | fSendNext fSendMax 1966 // | | | 1967 // v v v 1968 // ----------------------------------- 1969 // | effective window | 1970 // ----------------------------------- 1971 1972 // Flight size represents the window of data which is currently in the 1973 // ether. We should never send data such as the flight size becomes larger 1974 // than the effective window. Note however that the effective window may be 1975 // reduced (by congestion for instance), so at some point in time flight 1976 // size may be larger than the currently calculated window. 1977 1978 uint32 flightSize = (fSendMax - fSendUnacknowledged).Number(); 1979 uint32 consumedWindow = (fSendNext - fSendUnacknowledged).Number(); 1980 1981 if (consumedWindow > sendWindow) { 1982 sendWindow = 0; 1983 // TODO: enter persist state? try to get a window update. 1984 } else 1985 sendWindow -= consumedWindow; 1986 1987 if (force && sendWindow == 0 && fSendNext <= fSendQueue.LastSequence()) { 1988 // send one byte of data to ask for a window update 1989 // (triggered by the persist timer) 1990 sendWindow = 1; 1991 } 1992 1993 uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow); 1994 tcp_sequence previousSendNext = fSendNext; 1995 1996 do { 1997 uint32 segmentMaxSize = fSendMaxSegmentSize 1998 - tcp_options_length(segment); 1999 uint32 segmentLength = min_c(length, segmentMaxSize); 2000 2001 if (fSendNext + segmentLength == fSendQueue.LastSequence()) { 2002 if (state_needs_finish(fState)) 2003 segment.flags |= TCP_FLAG_FINISH; 2004 if (length > 0) 2005 segment.flags |= TCP_FLAG_PUSH; 2006 } 2007 2008 // Determine if we should really send this segment 2009 if (!force && !_ShouldSendSegment(segment, segmentLength, 2010 segmentMaxSize, flightSize)) { 2011 if (fSendQueue.Available() 2012 && !gStackModule->is_timer_active(&fPersistTimer) 2013 && 
!gStackModule->is_timer_active(&fRetransmitTimer)) 2014 _StartPersistTimer(); 2015 break; 2016 } 2017 2018 net_buffer *buffer = gBufferModule->create(256); 2019 if (buffer == NULL) 2020 return B_NO_MEMORY; 2021 2022 status_t status = B_OK; 2023 if (segmentLength > 0) 2024 status = fSendQueue.Get(buffer, fSendNext, segmentLength); 2025 if (status < B_OK) { 2026 gBufferModule->free(buffer); 2027 return status; 2028 } 2029 2030 LocalAddress().CopyTo(buffer->source); 2031 PeerAddress().CopyTo(buffer->destination); 2032 2033 uint32 size = buffer->size; 2034 segment.sequence = fSendNext.Number(); 2035 2036 TRACE("SendQueued(): buffer %p (%lu bytes) address %s to %s\n" 2037 "\tflags 0x%x, seq %lu, ack %lu, rwnd %hu, cwnd %lu, ssthresh %lu\n" 2038 "\tlen %lu first %lu last %lu", 2039 buffer, buffer->size, PrintAddress(buffer->source), 2040 PrintAddress(buffer->destination), segment.flags, segment.sequence, 2041 segment.acknowledge, segment.advertised_window, 2042 fCongestionWindow, fSlowStartThreshold, segmentLength, 2043 fSendQueue.FirstSequence().Number(), 2044 fSendQueue.LastSequence().Number()); 2045 T(Send(this, segment, buffer, fSendQueue.FirstSequence(), 2046 fSendQueue.LastSequence())); 2047 2048 PROBE(buffer, sendWindow); 2049 sendWindow -= buffer->size; 2050 2051 status = add_tcp_header(AddressModule(), segment, buffer); 2052 if (status != B_OK) { 2053 gBufferModule->free(buffer); 2054 return status; 2055 } 2056 2057 // Update send status - we need to do this before we send the data 2058 // for local connections as the answer is directly handled 2059 2060 if (segment.flags & TCP_FLAG_SYNCHRONIZE) { 2061 segment.options &= ~TCP_HAS_WINDOW_SCALE; 2062 segment.max_segment_size = 0; 2063 size++; 2064 } 2065 2066 if (segment.flags & TCP_FLAG_FINISH) 2067 size++; 2068 2069 uint32 sendMax = fSendMax.Number(); 2070 fSendNext += size; 2071 if (fSendMax < fSendNext) 2072 fSendMax = fSendNext; 2073 2074 fReceiveMaxAdvertised = fReceiveNext 2075 + 
((uint32)segment.advertised_window << fReceiveWindowShift); 2076 2077 status = next->module->send_routed_data(next, fRoute, buffer); 2078 if (status < B_OK) { 2079 gBufferModule->free(buffer); 2080 2081 fSendNext = segment.sequence; 2082 fSendMax = sendMax; 2083 // restore send status 2084 return status; 2085 } 2086 2087 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) 2088 fLastAcknowledgeSent = segment.acknowledge; 2089 2090 length -= segmentLength; 2091 segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET 2092 | TCP_FLAG_FINISH); 2093 } while (length > 0); 2094 2095 // if we sent data from the beggining of the send queue, 2096 // start the retransmition timer 2097 if (previousSendNext == fSendUnacknowledged 2098 && fSendNext > previousSendNext) { 2099 TRACE(" SendQueue(): set retransmit timer with rto %llu", 2100 fRetransmitTimeout); 2101 2102 gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout); 2103 } 2104 2105 return B_OK; 2106 } 2107 2108 2109 int 2110 TCPEndpoint::_MaxSegmentSize(const sockaddr* address) const 2111 { 2112 return next->module->get_mtu(next, address) - sizeof(tcp_header); 2113 } 2114 2115 2116 status_t 2117 TCPEndpoint::_PrepareSendPath(const sockaddr* peer) 2118 { 2119 if (fRoute == NULL) { 2120 fRoute = gDatalinkModule->get_route(Domain(), peer); 2121 if (fRoute == NULL) 2122 return ENETUNREACH; 2123 2124 if ((fRoute->flags & RTF_LOCAL) != 0) 2125 fFlags |= FLAG_LOCAL; 2126 } 2127 2128 // make sure connection does not already exist 2129 status_t status = fManager->SetConnection(this, *LocalAddress(), peer, 2130 fRoute->interface_address->local); 2131 if (status < B_OK) 2132 return status; 2133 2134 fInitialSendSequence = system_time() >> 4; 2135 fSendNext = fInitialSendSequence; 2136 fSendUnacknowledged = fInitialSendSequence; 2137 fSendMax = fInitialSendSequence; 2138 fSendUrgentOffset = fInitialSendSequence; 2139 2140 // we are counting the SYN here 2141 fSendQueue.SetInitialSequence(fSendNext + 1); 2142 2143 fReceiveMaxSegmentSize 
= _MaxSegmentSize(peer); 2144 2145 // Compute the window shift we advertise to our peer - if it doesn't support 2146 // this option, this will be reset to 0 (when its SYN is received) 2147 fReceiveWindowShift = 0; 2148 while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT 2149 && (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) { 2150 fReceiveWindowShift++; 2151 } 2152 2153 return B_OK; 2154 } 2155 2156 2157 void 2158 TCPEndpoint::_Acknowledged(tcp_segment_header& segment) 2159 { 2160 size_t previouslyUsed = fSendQueue.Used(); 2161 2162 fSendQueue.RemoveUntil(segment.acknowledge); 2163 fSendUnacknowledged = segment.acknowledge; 2164 2165 if (fSendNext < fSendUnacknowledged) 2166 fSendNext = fSendUnacknowledged; 2167 2168 if (fSendUnacknowledged == fSendMax) 2169 gStackModule->cancel_timer(&fRetransmitTimer); 2170 2171 if (fSendQueue.Used() < previouslyUsed) { 2172 // this ACK acknowledged data 2173 2174 if (segment.options & TCP_HAS_TIMESTAMPS) 2175 _UpdateRoundTripTime(tcp_diff_timestamp(segment.timestamp_reply)); 2176 else { 2177 // TODO: Fallback to RFC 793 type estimation 2178 } 2179 2180 if (is_writable(fState)) { 2181 // notify threads waiting on the socket to become writable again 2182 fSendList.Signal(); 2183 gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Used()); 2184 } 2185 2186 if (fCongestionWindow < fSlowStartThreshold) 2187 fCongestionWindow += fSendMaxSegmentSize; 2188 } 2189 2190 if (fCongestionWindow >= fSlowStartThreshold) { 2191 uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize; 2192 2193 if (increment < fCongestionWindow) 2194 increment = 1; 2195 else 2196 increment /= fCongestionWindow; 2197 2198 fCongestionWindow += increment; 2199 } 2200 2201 // if there is data left to be send, send it now 2202 if (fSendQueue.Used() > 0) 2203 _SendQueued(); 2204 } 2205 2206 2207 void 2208 TCPEndpoint::_Retransmit() 2209 { 2210 TRACE("Retransmit()"); 2211 _ResetSlowStart(); 2212 fSendNext = fSendUnacknowledged; 2213 
_SendQueued(); 2214 } 2215 2216 2217 void 2218 TCPEndpoint::_UpdateRoundTripTime(int32 roundTripTime) 2219 { 2220 int32 rtt = roundTripTime; 2221 2222 // "smooth" round trip time as per Van Jacobson 2223 rtt -= fRoundTripTime / 8; 2224 fRoundTripTime += rtt; 2225 if (rtt < 0) 2226 rtt = -rtt; 2227 rtt -= fRoundTripDeviation / 4; 2228 fRoundTripDeviation += rtt; 2229 2230 fRetransmitTimeout = ((fRoundTripTime / 4 + fRoundTripDeviation) / 2) 2231 * kTimestampFactor; 2232 2233 TRACE(" RTO is now %llu (after rtt %ldms)", fRetransmitTimeout, 2234 roundTripTime); 2235 } 2236 2237 2238 void 2239 TCPEndpoint::_ResetSlowStart() 2240 { 2241 fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged).Number() / 2, 2242 2 * fSendMaxSegmentSize); 2243 fCongestionWindow = fSendMaxSegmentSize; 2244 } 2245 2246 2247 // #pragma mark - timer 2248 2249 2250 /*static*/ void 2251 TCPEndpoint::_RetransmitTimer(net_timer* timer, void* _endpoint) 2252 { 2253 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2254 T(Timer(endpoint, "retransmit")); 2255 2256 MutexLocker locker(endpoint->fLock); 2257 if (!locker.IsLocked()) 2258 return; 2259 2260 endpoint->_Retransmit(); 2261 } 2262 2263 2264 /*static*/ void 2265 TCPEndpoint::_PersistTimer(net_timer* timer, void* _endpoint) 2266 { 2267 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2268 T(Timer(endpoint, "persist")); 2269 2270 MutexLocker locker(endpoint->fLock); 2271 if (!locker.IsLocked()) 2272 return; 2273 2274 // the timer might not have been canceled early enough 2275 if (endpoint->State() == CLOSED) 2276 return; 2277 2278 endpoint->_SendQueued(true); 2279 } 2280 2281 2282 /*static*/ void 2283 TCPEndpoint::_DelayedAcknowledgeTimer(net_timer* timer, void* _endpoint) 2284 { 2285 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2286 T(Timer(endpoint, "delayed ack")); 2287 2288 MutexLocker locker(endpoint->fLock); 2289 if (!locker.IsLocked()) 2290 return; 2291 2292 // the timer might not have been canceled early enough 2293 if 
(endpoint->State() == CLOSED) 2294 return; 2295 2296 endpoint->SendAcknowledge(true); 2297 } 2298 2299 2300 /*static*/ void 2301 TCPEndpoint::_TimeWaitTimer(net_timer* timer, void* _endpoint) 2302 { 2303 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2304 T(Timer(endpoint, "time-wait")); 2305 2306 MutexLocker locker(endpoint->fLock); 2307 if (!locker.IsLocked()) 2308 return; 2309 2310 if ((endpoint->fFlags & FLAG_CLOSED) == 0) { 2311 endpoint->fFlags |= FLAG_DELETE_ON_CLOSE; 2312 return; 2313 } 2314 2315 locker.Unlock(); 2316 2317 gSocketModule->release_socket(endpoint->socket); 2318 } 2319 2320 2321 // #pragma mark - 2322 2323 2324 void 2325 TCPEndpoint::Dump() const 2326 { 2327 kprintf("TCP endpoint %p\n", this); 2328 kprintf(" state: %s\n", name_for_state(fState)); 2329 kprintf(" flags: 0x%lx\n", fFlags); 2330 #if KDEBUG 2331 kprintf(" lock: { %p, holder: %ld }\n", &fLock, fLock.holder); 2332 #endif 2333 kprintf(" accept sem: %ld\n", fAcceptSemaphore); 2334 kprintf(" options: 0x%lx\n", (uint32)fOptions); 2335 kprintf(" send\n"); 2336 kprintf(" window shift: %u\n", fSendWindowShift); 2337 kprintf(" unacknowledged: %lu\n", fSendUnacknowledged.Number()); 2338 kprintf(" next: %lu\n", fSendNext.Number()); 2339 kprintf(" max: %lu\n", fSendMax.Number()); 2340 kprintf(" urgent offset: %lu\n", fSendUrgentOffset.Number()); 2341 kprintf(" window: %lu\n", fSendWindow); 2342 kprintf(" max window: %lu\n", fSendMaxWindow); 2343 kprintf(" max segment size: %lu\n", fSendMaxSegmentSize); 2344 kprintf(" queue: %lu / %lu\n", fSendQueue.Used(), fSendQueue.Size()); 2345 #if DEBUG_BUFFER_QUEUE 2346 fSendQueue.Dump(); 2347 #endif 2348 kprintf(" last acknowledge sent: %lu\n", fLastAcknowledgeSent.Number()); 2349 kprintf(" initial sequence: %lu\n", fInitialSendSequence.Number()); 2350 kprintf(" receive\n"); 2351 kprintf(" window shift: %u\n", fReceiveWindowShift); 2352 kprintf(" next: %lu\n", fReceiveNext.Number()); 2353 kprintf(" max advertised: %lu\n", 
fReceiveMaxAdvertised.Number()); 2354 kprintf(" window: %lu\n", fReceiveWindow); 2355 kprintf(" max segment size: %lu\n", fReceiveMaxSegmentSize); 2356 kprintf(" queue: %lu / %lu\n", fReceiveQueue.Available(), 2357 fReceiveQueue.Size()); 2358 #if DEBUG_BUFFER_QUEUE 2359 fReceiveQueue.Dump(); 2360 #endif 2361 kprintf(" initial sequence: %lu\n", fInitialReceiveSequence.Number()); 2362 kprintf(" duplicate acknowledge count: %lu\n", 2363 fDuplicateAcknowledgeCount); 2364 kprintf(" round trip time: %ld (deviation %ld)\n", fRoundTripTime, 2365 fRoundTripDeviation); 2366 kprintf(" retransmit timeout: %lld\n", fRetransmitTimeout); 2367 kprintf(" congestion window: %lu\n", fCongestionWindow); 2368 kprintf(" slow start threshold: %lu\n", fSlowStartThreshold); 2369 } 2370 2371